From 60452a30f5971c54ecdcd1bd389c1cabbbc844f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:27 +0200 Subject: grep: break down an "if" stmt in preparation for next changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano diff --git a/grep.c b/grep.c index 7b2b96a..f430d7e 100644 --- a/grep.c +++ b/grep.c @@ -403,7 +403,9 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) p->word_regexp = opt->word_regexp; p->ignore_case = opt->ignore_case; - if (opt->fixed || is_fixed(p->pattern, p->patternlen)) + if (opt->fixed) + p->fixed = 1; + else if (is_fixed(p->pattern, p->patternlen)) p->fixed = 1; else p->fixed = 0; -- cgit v0.10.2-6-g49f6 From 949782d860889cbd50edfdf8c00e91d7a4f6cb05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:28 +0200 Subject: test-regex: isolate the bug test code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is in preparation to turn test-regex into some generic regex testing command. Helped-by: Eric Sunshine Helped-by: Ramsay Jones Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano diff --git a/t/t0070-fundamental.sh b/t/t0070-fundamental.sh index 5ed69a6..991ed2a 100755 --- a/t/t0070-fundamental.sh +++ b/t/t0070-fundamental.sh @@ -31,7 +31,7 @@ test_expect_success 'git_mkstemps_mode does not fail if fd 0 is not open' ' test_expect_success 'check for a bug in the regex routines' ' # if this test fails, re-build git with NO_REGEX=1 - test-regex + test-regex --bug ' test_done diff --git a/test-regex.c b/test-regex.c index 0dc598e..67a1a65 100644 --- a/test-regex.c +++ b/test-regex.c @@ -1,6 +1,6 @@ #include "git-compat-util.h" -int main(int argc, char **argv) +static int test_regex_bug(void) { char *pat = "[^={} \t]+"; char *str = "={}\nfred"; @@ -16,5 +16,13 @@ int main(int argc, char **argv) if (m[0].rm_so == 3) /* matches '\n' when it should not */ die("regex bug confirmed: re-build git with NO_REGEX=1"); - exit(0); + return 0; +} + +int main(int argc, char **argv) +{ + if (argc == 2 && !strcmp(argv[1], "--bug")) + return test_regex_bug(); + else + usage("test-regex --bug"); } -- cgit v0.10.2-6-g49f6 From d8acfe1eaf8621025c4095d3fbb88b2703b3fa54 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:29 +0200 Subject: test-regex: expose full regcomp() to the command line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano diff --git a/test-regex.c b/test-regex.c index 67a1a65..eff26f5 100644 --- a/test-regex.c +++ b/test-regex.c @@ -1,4 +1,21 @@ #include "git-compat-util.h" +#include "gettext.h" + +struct reg_flag { + const char *name; + int flag; +}; + +static struct reg_flag reg_flags[] = { + { "EXTENDED", REG_EXTENDED }, + { "NEWLINE", REG_NEWLINE }, + { "ICASE", REG_ICASE }, + { "NOTBOL", REG_NOTBOL }, +#ifdef REG_STARTEND + { "STARTEND", REG_STARTEND }, +#endif + { NULL, 0 } +}; static int test_regex_bug(void) { @@ -21,8 +38,38 @@ static int test_regex_bug(void) int main(int argc, char **argv) { + const char *pat; + const char *str; + int flags = 0; + regex_t r; + regmatch_t m[1]; + if (argc == 2 && !strcmp(argv[1], "--bug")) return test_regex_bug(); - else - usage("test-regex --bug"); + else if (argc < 3) + usage("test-regex --bug\n" + "test-regex []"); + + argv++; + pat = *argv++; + str = *argv++; + while (*argv) { + struct reg_flag *rf; + for (rf = reg_flags; rf->name; rf++) + if (!strcmp(*argv, rf->name)) { + flags |= rf->flag; + break; + } + if (!rf->name) + die("do not recognize %s", *argv); + argv++; + } + git_setup_gettext(); + + if (regcomp(&r, pat, flags)) + die("failed regcomp() for pattern '%s'", pat); + if (regexec(&r, str, 1, m, 0)) + return 1; + + return 0; } -- cgit v0.10.2-6-g49f6 From 5c1ebcca4d1df3b2a84d01b3f32ec13bf8f760f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:30 +0200 Subject: grep/icase: avoid kwsset on literal non-ascii strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we detect the pattern is just a literal string, we avoid heavy regex engine and use fast substring search implemented in kwsset.c. But kws uses git-ctype which is locale-independent so it does not know how to fold case properly outside ascii range. Let regcomp or pcre take care of this case instead. Slower, but accurate. Noticed-by: Plamen Totev Helped-by: René Scharfe Helped-by: Eric Sunshine Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano diff --git a/grep.c b/grep.c index f430d7e..451275d 100644 --- a/grep.c +++ b/grep.c @@ -4,6 +4,7 @@ #include "xdiff-interface.h" #include "diff.h" #include "diffcore.h" +#include "commit.h" static int grep_source_load(struct grep_source *gs); static int grep_source_is_binary(struct grep_source *gs); @@ -398,14 +399,18 @@ static int is_fixed(const char *s, size_t len) static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) { + int icase, ascii_only; int err; p->word_regexp = opt->word_regexp; p->ignore_case = opt->ignore_case; + icase = opt->regflags & REG_ICASE || p->ignore_case; + ascii_only = !has_non_ascii(p->pattern); if (opt->fixed) p->fixed = 1; - else if (is_fixed(p->pattern, p->patternlen)) + else if ((!icase || ascii_only) && + is_fixed(p->pattern, p->patternlen)) p->fixed = 1; else p->fixed = 0; diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh new file mode 100755 index 0000000..b78a774 --- /dev/null +++ b/t/t7812-grep-icase-non-ascii.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +test_description='grep icase on non-English locales' + +. ./lib-gettext.sh + +test_expect_success GETTEXT_LOCALE 'setup' ' + test_write_lines "TILRAUN: Halló Heimur!" >file && + git add file && + LC_ALL="$is_IS_locale" && + export LC_ALL +' + +test_have_prereq GETTEXT_LOCALE && +test-regex "HALLÓ" "Halló" ICASE && +test_set_prereq REGEX_LOCALE + +test_expect_success REGEX_LOCALE 'grep literal string, no -F' ' + git grep -i "TILRAUN: Halló Heimur!" && + git grep -i "TILRAUN: HALLÓ HEIMUR!" +' + +test_done -- cgit v0.10.2-6-g49f6 From 793dc676e08394ed15bffe0ed66606ff9ced1c6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:31 +0200 Subject: grep/icase: avoid kwsset when -F is specified MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to the previous commit, we can't use kws on icase search outside ascii range. But we can't simply pass the pattern to regcomp/pcre like the previous commit because it may contain regex special characters, so we need to quote the regex first. To avoid misquote traps that could lead to undefined behavior, we always stick to basic regex engine in this case. We don't need fancy features for grepping a literal string anyway. basic_regex_quote_buf() assumes that if the pattern is in a multibyte encoding, ascii chars must be unambiguously encoded as single bytes. This is true at least for UTF-8. For others, let's wait until people yell up. Chances are nobody uses multibyte, non utf-8 charsets anymore. Noticed-by: Plamen Totev Helped-by: René Scharfe Helped-by: Eric Sunshine Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano diff --git a/grep.c b/grep.c index 451275d..627ae3e 100644 --- a/grep.c +++ b/grep.c @@ -5,6 +5,7 @@ #include "diff.h" #include "diffcore.h" #include "commit.h" +#include "quote.h" static int grep_source_load(struct grep_source *gs); static int grep_source_is_binary(struct grep_source *gs); @@ -397,6 +398,28 @@ static int is_fixed(const char *s, size_t len) return 1; } +static void compile_fixed_regexp(struct grep_pat *p, struct grep_opt *opt) +{ + struct strbuf sb = STRBUF_INIT; + int err; + int regflags; + + basic_regex_quote_buf(&sb, p->pattern); + regflags = opt->regflags & ~REG_EXTENDED; + if (opt->ignore_case) + regflags |= REG_ICASE; + err = regcomp(&p->regexp, sb.buf, regflags); + if (opt->debug) + fprintf(stderr, "fixed %s\n", sb.buf); + strbuf_release(&sb); + if (err) { + char errbuf[1024]; + regerror(err, &p->regexp, errbuf, sizeof(errbuf)); + regfree(&p->regexp); + compile_regexp_failed(p, errbuf); + } +} + static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) { int icase, ascii_only; @@ -407,8 +430,20 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) icase = opt->regflags & REG_ICASE || p->ignore_case; ascii_only = !has_non_ascii(p->pattern); + /* + * Even when -F (fixed) asks us to do a non-regexp search, we + * may not be able to correctly case-fold when -i + * (ignore-case) is asked (in which case, we'll synthesize a + * regexp to match the pattern that matches regexp special + * characters literally, while ignoring case differences). On + * the other hand, even without -F, if the pattern does not + * have any regexp special characters and there is no need for + * case-folding search, we can internally turn it into a + * simple string match using kws. p->fixed tells us if we + * want to use kws. + */ if (opt->fixed) - p->fixed = 1; + p->fixed = !icase || ascii_only; else if ((!icase || ascii_only) && is_fixed(p->pattern, p->patternlen)) p->fixed = 1; @@ -423,6 +458,14 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) kwsincr(p->kws, p->pattern, p->patternlen); kwsprep(p->kws); return; + } else if (opt->fixed) { + /* + * We come here when the pattern has the non-ascii + * characters we cannot case-fold, and asked to + * ignore-case. + */ + compile_fixed_regexp(p, opt); + return; } if (opt->pcre) { diff --git a/quote.c b/quote.c index fe884d2..c67adb7 100644 --- a/quote.c +++ b/quote.c @@ -440,3 +440,40 @@ void tcl_quote_buf(struct strbuf *sb, const char *src) } strbuf_addch(sb, '"'); } + +void basic_regex_quote_buf(struct strbuf *sb, const char *src) +{ + char c; + + if (*src == '^') { + /* only beginning '^' is special and needs quoting */ + strbuf_addch(sb, '\\'); + strbuf_addch(sb, *src++); + } + if (*src == '*') + /* beginning '*' is not special, no quoting */ + strbuf_addch(sb, *src++); + + while ((c = *src++)) { + switch (c) { + case '[': + case '.': + case '\\': + case '*': + strbuf_addch(sb, '\\'); + strbuf_addch(sb, c); + break; + + case '$': + /* only the end '$' is special and needs quoting */ + if (*src == '\0') + strbuf_addch(sb, '\\'); + strbuf_addch(sb, c); + break; + + default: + strbuf_addch(sb, c); + break; + } + } +} diff --git a/quote.h b/quote.h index 99e04d3..362d315 100644 --- a/quote.h +++ b/quote.h @@ -67,5 +67,6 @@ extern char *quote_path_relative(const char *in, const char *prefix, extern void perl_quote_buf(struct strbuf *sb, const char *src); extern void python_quote_buf(struct strbuf *sb, const char *src); extern void tcl_quote_buf(struct strbuf *sb, const char *src); +extern void basic_regex_quote_buf(struct strbuf *sb, const char *src); #endif diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh index b78a774..1929809 100755 --- a/t/t7812-grep-icase-non-ascii.sh +++ b/t/t7812-grep-icase-non-ascii.sh @@ -20,4 +20,30 @@ test_expect_success REGEX_LOCALE 'grep literal string, no -F' ' git grep -i "TILRAUN: HALLÓ HEIMUR!" ' +test_expect_success REGEX_LOCALE 'grep literal string, with -F' ' + git grep --debug -i -F "TILRAUN: Halló Heimur!" 2>&1 >/dev/null | + grep fixed >debug1 && + test_write_lines "fixed TILRAUN: Halló Heimur!" >expect1 && + test_cmp expect1 debug1 && + + git grep --debug -i -F "TILRAUN: HALLÓ HEIMUR!" 2>&1 >/dev/null | + grep fixed >debug2 && + test_write_lines "fixed TILRAUN: HALLÓ HEIMUR!" >expect2 && + test_cmp expect2 debug2 +' + +test_expect_success REGEX_LOCALE 'grep string with regex, with -F' ' + test_write_lines "^*TILR^AUN:.* \\Halló \$He[]imur!\$" >file && + + git grep --debug -i -F "^*TILR^AUN:.* \\Halló \$He[]imur!\$" 2>&1 >/dev/null | + grep fixed >debug1 && + test_write_lines "fixed \\^*TILR^AUN:\\.\\* \\\\Halló \$He\\[]imur!\\\$" >expect1 && + test_cmp expect1 debug1 && + + git grep --debug -i -F "^*TILR^AUN:.* \\HALLÓ \$HE[]IMUR!\$" 2>&1 >/dev/null | + grep fixed >debug2 && + test_write_lines "fixed \\^*TILR^AUN:\\.\\* \\\\HALLÓ \$HE\\[]IMUR!\\\$" >expect2 && + test_cmp expect2 debug2 +' + test_done -- cgit v0.10.2-6-g49f6 From e944d9d932b3653debcfdd16eec2721eed0670e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:32 +0200 Subject: grep: rewrite an if/else condition to avoid duplicate expression MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit "!icase || ascii_only" is repeated twice in this if/else chain as this series evolves. Rewrite it (and basically revert the first if condition back to before the "grep: break down an "if" stmt..." commit). Helped-by: Junio C Hamano Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano diff --git a/grep.c b/grep.c index 627ae3e..6325caf 100644 --- a/grep.c +++ b/grep.c @@ -442,11 +442,8 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) * simple string match using kws. p->fixed tells us if we * want to use kws. */ - if (opt->fixed) + if (opt->fixed || is_fixed(p->pattern, p->patternlen)) p->fixed = !icase || ascii_only; - else if ((!icase || ascii_only) && - is_fixed(p->pattern, p->patternlen)) - p->fixed = 1; else p->fixed = 0; -- cgit v0.10.2-6-g49f6 From 9d9babb84d45234132f3cb1f4527ce1106e3d2ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:33 +0200 Subject: grep/pcre: prepare locale-dependent tables for icase matching MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default tables are usually built with C locale and only suitable for LANG=C or similar. This should make case insensitive search work correctly for all single-byte charsets. Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano diff --git a/grep.c b/grep.c index 6325caf..af920c4 100644 --- a/grep.c +++ b/grep.c @@ -324,11 +324,14 @@ static void compile_pcre_regexp(struct grep_pat *p, const struct grep_opt *opt) int erroffset; int options = PCRE_MULTILINE; - if (opt->ignore_case) + if (opt->ignore_case) { + if (has_non_ascii(p->pattern)) + p->pcre_tables = pcre_maketables(); options |= PCRE_CASELESS; + } p->pcre_regexp = pcre_compile(p->pattern, options, &error, &erroffset, - NULL); + p->pcre_tables); if (!p->pcre_regexp) compile_regexp_failed(p, error); @@ -362,6 +365,7 @@ static void free_pcre_regexp(struct grep_pat *p) { pcre_free(p->pcre_regexp); pcre_free(p->pcre_extra_info); + pcre_free((void *)p->pcre_tables); } #else /* !USE_LIBPCRE */ static void compile_pcre_regexp(struct grep_pat *p, const struct grep_opt *opt) diff --git a/grep.h b/grep.h index 95f197a..cee4357 100644 --- a/grep.h +++ b/grep.h @@ -48,6 +48,7 @@ struct grep_pat { regex_t regexp; pcre *pcre_regexp; pcre_extra *pcre_extra_info; + const unsigned char *pcre_tables; kwset_t kws; unsigned fixed:1; unsigned ignore_case:1; diff --git a/t/t7813-grep-icase-iso.sh b/t/t7813-grep-icase-iso.sh new file mode 100755 index 0000000..efef7fb --- /dev/null +++ b/t/t7813-grep-icase-iso.sh @@ -0,0 +1,19 @@ +#!/bin/sh + +test_description='grep icase on non-English locales' + +. ./lib-gettext.sh + +test_expect_success GETTEXT_ISO_LOCALE 'setup' ' + printf "TILRAUN: Hall Heimur!" >file && + git add file && + LC_ALL="$is_IS_iso_locale" && + export LC_ALL +' + +test_expect_success GETTEXT_ISO_LOCALE,LIBPCRE 'grep pcre string' ' + git grep --perl-regexp -i "TILRAUN: H.ll Heimur!" && + git grep --perl-regexp -i "TILRAUN: H.LL HEIMUR!" +' + +test_done -- cgit v0.10.2-6-g49f6 From e8c1672655dcff59aaf0f78fa256b1d2e3f1ba9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:34 +0200 Subject: gettext: add is_utf8_locale() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This function returns true if git is running under an UTF-8 locale. pcre in the next patch will need this. is_encoding_utf8() is used instead of strcmp() to catch both "utf-8" and "utf8" suffixes. When built with no gettext support, we peek in several env variables to detect UTF-8. pcre library might support utf-8 even if libc is built without locale support.. The peeking code is a copy from compat/regex/regcomp.c Helped-by: Ævar Arnfjörð Bjarmason Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano diff --git a/gettext.c b/gettext.c index a268a2c..db727ea 100644 --- a/gettext.c +++ b/gettext.c @@ -18,6 +18,8 @@ # endif #endif +static const char *charset; + /* * Guess the user's preferred languages from the value in LANGUAGE environment * variable and LC_MESSAGES locale category if NO_GETTEXT is not defined. @@ -65,7 +67,6 @@ static int test_vsnprintf(const char *fmt, ...) return ret; } -static const char *charset; static void init_gettext_charset(const char *domain) { /* @@ -172,8 +173,27 @@ int gettext_width(const char *s) { static int is_utf8 = -1; if (is_utf8 == -1) - is_utf8 = !strcmp(charset, "UTF-8"); + is_utf8 = is_utf8_locale(); return is_utf8 ? utf8_strwidth(s) : strlen(s); } #endif + +int is_utf8_locale(void) +{ +#ifdef NO_GETTEXT + if (!charset) { + const char *env = getenv("LC_ALL"); + if (!env || !*env) + env = getenv("LC_CTYPE"); + if (!env || !*env) + env = getenv("LANG"); + if (!env) + env = ""; + if (strchr(env, '.')) + env = strchr(env, '.') + 1; + charset = xstrdup(env); + } +#endif + return is_encoding_utf8(charset); +} diff --git a/gettext.h b/gettext.h index 33696a4..7eee64a 100644 --- a/gettext.h +++ b/gettext.h @@ -90,5 +90,6 @@ const char *Q_(const char *msgid, const char *plu, unsigned long n) #endif const char *get_preferred_languages(void); +extern int is_utf8_locale(void); #endif -- cgit v0.10.2-6-g49f6 From 18547aacf5ced88bf83d85b03b2b7b3fc6aa3f6c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:35 +0200 Subject: grep/pcre: support utf-8 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the previous change in this function, we add locale support for single-byte encodings only. It looks like pcre only supports utf-* as multibyte encodings, the others are left in the cold (which is fine). We need to enable PCRE_UTF8 so pcre can find character boundary correctly. It's needed for case folding (when --ignore-case is used) or '*', '+' or similar syntax is used. The "has_non_ascii()" check is to be on the conservative side. If there's non-ascii in the pattern, the searched content could still be in utf-8, but we can treat it just like a byte stream and everything should work. If we force utf-8 based on locale only and pcre validates utf-8 and the file content is in non-utf8 encoding, things break. Noticed-by: Plamen Totev Helped-by: Plamen Totev Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano diff --git a/grep.c b/grep.c index af920c4..0e6511f 100644 --- a/grep.c +++ b/grep.c @@ -329,6 +329,8 @@ static void compile_pcre_regexp(struct grep_pat *p, const struct grep_opt *opt) p->pcre_tables = pcre_maketables(); options |= PCRE_CASELESS; } + if (is_utf8_locale() && has_non_ascii(p->pattern)) + options |= PCRE_UTF8; p->pcre_regexp = pcre_compile(p->pattern, options, &error, &erroffset, p->pcre_tables); diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh index 1929809..08ae4c9 100755 --- a/t/t7812-grep-icase-non-ascii.sh +++ b/t/t7812-grep-icase-non-ascii.sh @@ -20,6 +20,21 @@ test_expect_success REGEX_LOCALE 'grep literal string, no -F' ' git grep -i "TILRAUN: HALLÓ HEIMUR!" ' +test_expect_success GETTEXT_LOCALE,LIBPCRE 'grep pcre utf-8 icase' ' + git grep --perl-regexp "TILRAUN: H.lló Heimur!" && + git grep --perl-regexp -i "TILRAUN: H.lló Heimur!" && + git grep --perl-regexp -i "TILRAUN: H.LLÓ HEIMUR!" +' + +test_expect_success GETTEXT_LOCALE,LIBPCRE 'grep pcre utf-8 string with "+"' ' + test_write_lines "TILRAUN: Hallóó Heimur!" >file2 && + git add file2 && + git grep -l --perl-regexp "TILRAUN: H.lló+ Heimur!" >actual && + echo file >expected && + echo file2 >>expected && + test_cmp expected actual +' + test_expect_success REGEX_LOCALE 'grep literal string, with -F' ' git grep --debug -i -F "TILRAUN: Halló Heimur!" 2>&1 >/dev/null | grep fixed >debug1 && -- cgit v0.10.2-6-g49f6 From 3d5b23a36218b0417a056fa7b5e6d25d595ccaf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:36 +0200 Subject: diffcore-pickaxe: Add regcomp_or_die() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There's another regcomp code block coming in this function that needs the same error handling. This function can help avoid duplicating error handling code. Helped-by: Jeff King Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c index 7715c13..2093b6a 100644 --- a/diffcore-pickaxe.c +++ b/diffcore-pickaxe.c @@ -198,6 +198,18 @@ static void pickaxe(struct diff_queue_struct *q, struct diff_options *o, *q = outq; } +static void regcomp_or_die(regex_t *regex, const char *needle, int cflags) +{ + int err = regcomp(regex, needle, cflags); + if (err) { + /* The POSIX.2 people are surely sick */ + char errbuf[1024]; + regerror(err, regex, errbuf, 1024); + regfree(regex); + die("invalid regex: %s", errbuf); + } +} + void diffcore_pickaxe(struct diff_options *o) { const char *needle = o->pickaxe; @@ -206,18 +218,10 @@ void diffcore_pickaxe(struct diff_options *o) kwset_t kws = NULL; if (opts & (DIFF_PICKAXE_REGEX | DIFF_PICKAXE_KIND_G)) { - int err; int cflags = REG_EXTENDED | REG_NEWLINE; if (DIFF_OPT_TST(o, PICKAXE_IGNORE_CASE)) cflags |= REG_ICASE; - err = regcomp(®ex, needle, cflags); - if (err) { - /* The POSIX.2 people are surely sick */ - char errbuf[1024]; - regerror(err, ®ex, errbuf, 1024); - regfree(®ex); - die("invalid regex: %s", errbuf); - } + regcomp_or_die(®ex, needle, cflags); regexp = ®ex; } else { kws = kwsalloc(DIFF_OPT_TST(o, PICKAXE_IGNORE_CASE) -- cgit v0.10.2-6-g49f6 From b51a9c1479645b3e0c7d5156d027a97a4bb87977 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:37 +0200 Subject: diffcore-pickaxe: support case insensitive match on non-ascii MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Similar to the "grep -F -i" case, we can't use kws on icase search outside ascii range, so we quote the string and pass it to regcomp as a basic regexp and let regex engine deal with case sensitivity. The new test is put in t7812 instead of t4209-log-pickaxe because lib-gettext.sh might cause problems elsewhere, probably. Noticed-by: Plamen Totev Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano diff --git a/diffcore-pickaxe.c b/diffcore-pickaxe.c index 2093b6a..55067ca 100644 --- a/diffcore-pickaxe.c +++ b/diffcore-pickaxe.c @@ -7,6 +7,8 @@ #include "diffcore.h" #include "xdiff-interface.h" #include "kwset.h" +#include "commit.h" +#include "quote.h" typedef int (*pickaxe_fn)(mmfile_t *one, mmfile_t *two, struct diff_options *o, @@ -223,6 +225,15 @@ void diffcore_pickaxe(struct diff_options *o) cflags |= REG_ICASE; regcomp_or_die(®ex, needle, cflags); regexp = ®ex; + } else if (DIFF_OPT_TST(o, PICKAXE_IGNORE_CASE) && + has_non_ascii(needle)) { + struct strbuf sb = STRBUF_INIT; + int cflags = REG_NEWLINE | REG_ICASE; + + basic_regex_quote_buf(&sb, needle); + regcomp_or_die(®ex, sb.buf, cflags); + strbuf_release(&sb); + regexp = ®ex; } else { kws = kwsalloc(DIFF_OPT_TST(o, PICKAXE_IGNORE_CASE) ? tolower_trans_tbl : NULL); diff --git a/t/t7812-grep-icase-non-ascii.sh b/t/t7812-grep-icase-non-ascii.sh index 08ae4c9..169fd8d 100755 --- a/t/t7812-grep-icase-non-ascii.sh +++ b/t/t7812-grep-icase-non-ascii.sh @@ -61,4 +61,11 @@ test_expect_success REGEX_LOCALE 'grep string with regex, with -F' ' test_cmp expect2 debug2 ' +test_expect_success REGEX_LOCALE 'pickaxe -i on non-ascii' ' + git commit -m first && + git log --format=%f -i -S"TILRAUN: HALLÓ HEIMUR!" >actual && + echo first >expected && + test_cmp expected actual +' + test_done -- cgit v0.10.2-6-g49f6 From 695f95ba5dc4ab44f327574f85c3ebe7ebf449b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nguy=E1=BB=85n=20Th=C3=A1i=20Ng=E1=BB=8Dc=20Duy?= Date: Sat, 25 Jun 2016 07:22:38 +0200 Subject: grep.c: reuse "icase" variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Nguyễn Thái Ngọc Duy Signed-off-by: Junio C Hamano diff --git a/grep.c b/grep.c index 0e6511f..906406a 100644 --- a/grep.c +++ b/grep.c @@ -454,10 +454,7 @@ static void compile_regexp(struct grep_pat *p, struct grep_opt *opt) p->fixed = 0; if (p->fixed) { - if (opt->regflags & REG_ICASE || p->ignore_case) - p->kws = kwsalloc(tolower_trans_tbl); - else - p->kws = kwsalloc(NULL); + p->kws = kwsalloc(icase ? tolower_trans_tbl : NULL); kwsincr(p->kws, p->pattern, p->patternlen); kwsprep(p->kws); return; -- cgit v0.10.2-6-g49f6