CVE-2018-6797.diff   [plain text]


From 7510b7556ab38224572ba14dc7b279e094c0b039 Mon Sep 17 00:00:00 2001
From: Karl Williamson <khw@cpan.org>
Date: Mon, 19 Mar 2018 22:09:30 -0600
Subject: [PATCH 2/3] Fix [CVE-2018-6797] heap-buffer-overflow in regatom

This is a special 5.18 maintenance patch for [perl #132227].

This is caused by the German SHARP S, U+DF, under /ui rules folding to
two characters 'ss' instead of one, whereas under /di rules it requires
only one character.  If something causes the /di to change to /ui after
the sizing pass starts, the needed space calculated will be too small.

The solution adopted here, given the age of the release, is to simply
set a flag if one of these U+DF characters is encountered under /di
rules.  At the end of the sizing pass, that flag is checked along with
another existing flag that indicates things changed from /d to /u.  This
indicates to the code that it must redo the first pass.
---
 regcomp.c | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/regcomp.c b/regcomp.c
index 0841f17..ed16e24 100644
--- a/regcomp.c
+++ b/regcomp.c
@@ -171,6 +171,9 @@ typedef struct RExC_state_t {
 #define RExC_lastnum	(pRExC_state->lastnum)
 #define RExC_paren_name_list    (pRExC_state->paren_name_list)
 #endif
+    bool        seen_unfolded_sharp_s;
+#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
+
 } RExC_state_t;
 
 #define RExC_flags	(pRExC_state->flags)
@@ -183,6 +186,7 @@ typedef struct RExC_state_t {
 #define RExC_end	(pRExC_state->end)
 #define RExC_parse	(pRExC_state->parse)
 #define RExC_whilem_seen	(pRExC_state->whilem_seen)
+#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
 #ifdef RE_TRACK_PATTERN_OFFSETS
 #define RExC_offsets	(pRExC_state->rxi->u.offsets) /* I am not like the others */
 #endif
@@ -5681,6 +5685,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     /* ignore the utf8ness if the pattern is 0 length */
     RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
     RExC_uni_semantics = 0;
+    RExC_seen_unfolded_sharp_s = 0;
     RExC_contains_locale = 0;
     pRExC_state->runtime_code_qr = NULL;
 
@@ -5692,8 +5697,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         });
 
   redo_first_pass:
-    /* we jump here if we upgrade the pattern to utf8 and have to
-     * recompile */
+    /* we jump here if we upgrade the pattern to utf8 and have to recompile, or
+     * if we convert to unicode rules and the size may have changed. */
 
     if ((pm_flags & PMf_USE_RE_EVAL)
 		/* this second condition covers the non-regex literal case,
@@ -5727,11 +5732,18 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     if (initial_charset == REGEX_LOCALE_CHARSET) {
 	RExC_contains_locale = 1;
     }
-    else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
-
-	/* Set to use unicode semantics if the pattern is in utf8 and has the
-	 * 'depends' charset specified, as it means unicode when utf8  */
+    else if (        initial_charset == REGEX_DEPENDS_CHARSET
+             && (    RExC_utf8
+                 || (RExC_seen_unfolded_sharp_s && RExC_uni_semantics)))
+    {
+        /* Set to use unicode semantics if the 'depends' charset was specified,
+         * and either the pattern is in utf8 (which means it has to be unicode)
+         * or we are redoing the first pass and we saw an unfolded sharp_s in
+         * it, and the rules changed during the pass to be uni ones.  (This
+         * last clause in the 'if' can't be true here unless we're redoing
+         * pass1.) */
 	set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
+        RExC_seen_unfolded_sharp_s = 0;
     }
 
     RExC_precomp = exp;
@@ -5818,6 +5830,15 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
         }
         Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#X", flags);
     }
+
+    /* We have to redo the first pass to get correct sizing information if
+     * there were any sharp s's that didn't get folded, and we now are using
+     * unicode rules. This happens when the first pass starts out without uni
+     * rules, and something in it forces the change */
+    if (RExC_seen_unfolded_sharp_s && RExC_uni_semantics) {
+        goto redo_first_pass;
+    }
+
     if (code_blocksv)
 	SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
 
@@ -11117,6 +11138,9 @@ tryagain:
                     else {
                         *(s++) = (char) ender;
                         maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
+                        if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
+                            RExC_seen_unfolded_sharp_s = 1;
+                        }
                     }
 		}
 		else if (UTF) {
-- 
2.8.4 (Apple Git-73)