Last active
August 29, 2015 13:57
-
-
Save aristus/9688925 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- ext/standard/string.c (revision 301280) | |
+++ ext/standard/string.c (working copy) | |
@@ -1674,6 +1674,105 @@ | |
} | |
/* }}} */ | |
+#define FASTSEARCH_MIN 50 | |
+ | |
+/* | |
+ * Implementation of "fastsearch" algorithm devised by | |
+ * Fredrik Lundh: http://effbot.org/zone/stringlib.htm | |
+ * Adapted from Python-2.7/Objects/stringlib/fastsearch.h | |
+ * | |
+ * Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python | |
+ * Software Foundation; All Rights Reserved. | |
+ * Released under the Python Software Foundation License v2 | |
+ */ | |
+ | |
+/* Poor man's Bloom filter */ | |
+ | |
+#if SIZEOF_LONG == 4 && defined(HAVE_ZEND_LONG64) | |
+# define PHP_BLOOM_WIDTH 64 | |
+#else | |
+# define PHP_BLOOM_WIDTH 32 | |
+#endif | |
+ | |
+#define PHP_BLOOM_ADD(mask, ch) \ | |
+ ((mask |= (1UL << ((ch) & (PHP_BLOOM_WIDTH -1))))) | |
+#define PHP_BLOOM(mask, ch) \ | |
+ ((mask & (1UL << ((ch) & (PHP_BLOOM_WIDTH -1))))) | |
+ | |
+/* {{{ This is a fast search code based on the Python search method */ | |
+static char * | |
+php_fastsearch(char *haystack, char *needle, int needle_len, char *end) | |
+{ | |
+ char *p = haystack; | |
+ int haystack_len = end - haystack; | |
+ unsigned long mask; | |
+ int skip; | |
+ int j, mlast; | |
+ size_t i; | |
+ | |
+ /* Special case one-byte needles & needles longer than haystack */ | |
+ if (needle_len == 1) { | |
+ return (char *)memchr(p, *needle, (end-p)); | |
+ } | |
+ if (needle_len > haystack_len) { | |
+ return NULL; | |
+ } | |
+ | |
+ mlast = needle_len - 1; | |
+ skip = mlast - 1; | |
+ end -= needle_len; | |
+ | |
+ /* Process pattern up to penultimate char. Also calculate the | |
+ * longest safe skip distance in lieu of a proper skip table. | |
+ */ | |
+ for (i = 0; i < mlast; i++) { | |
+ PHP_BLOOM_ADD(mask, needle[i]); | |
+ if (needle[i] == needle[mlast]) { | |
+ skip = mlast - i - 1; | |
+ } | |
+ } | |
+ /* Add pattern[-1] to mask */ | |
+ PHP_BLOOM_ADD(mask, needle[mlast]); | |
+ | |
+ for (i = 0; i <= haystack_len; i++) { | |
+ /* Note: using mlast in the skip path slows things down on x86 */ | |
+ if (haystack[i+needle_len-1] == needle[needle_len-1]) { | |
+ | |
+ /* Candidate match, scan the rest | |
+ * todo: memcmp() ? */ | |
+ for (j = 0; j < mlast; j++) { | |
+ if (haystack[i+j] != needle[j]) { | |
+ break; | |
+ } | |
+ } | |
+ | |
+ if (j == mlast) { | |
+ return (char *)(p + i); | |
+ } | |
+ | |
+ /* Mismatch: check if next character is part of pattern */ | |
+ if (!PHP_BLOOM(mask, haystack[i+needle_len])) { | |
+ i = i + needle_len; | |
+ } else { | |
+ i = i + skip; | |
+ } | |
+ } else { | |
+ /* Opportunistic skip: the current char is not equal to the | |
+ * last pattern char. Check if *next* character is part of | |
+ * the pattern. If not, skip away. | |
+ */ | |
+ if (!PHP_BLOOM(mask, haystack[i+needle_len])) { | |
+ i = i + needle_len; | |
+ } | |
+ } | |
+ } | |
+ | |
+ /* No match. How sad. */ | |
+ return NULL; | |
+} | |
+/* }}} */ | |
+/* End Python Fast Search Code */ | |
+ | |
/* {{{ proto string strstr(string haystack, string needle[, bool part]) | |
Finds first occurrence of a string within another */ | |
PHP_FUNCTION(strstr) | |
@@ -1696,14 +1795,25 @@ | |
RETURN_FALSE; | |
} | |
- found = php_memnstr(haystack, Z_STRVAL_P(needle), Z_STRLEN_P(needle), haystack + haystack_len); | |
+ if (haystack_len >= FASTSEARCH_MIN) { | |
+ found = php_fastsearch(haystack, | |
+ Z_STRVAL_P(needle), | |
+ Z_STRLEN_P(needle), | |
+ haystack + haystack_len); | |
+ } else { | |
+ found = php_memnstr(haystack, | |
+ Z_STRVAL_P(needle), | |
+ Z_STRLEN_P(needle), | |
+ haystack + haystack_len); | |
+ } | |
} else { | |
if (php_needle_char(needle, needle_char TSRMLS_CC) != SUCCESS) { | |
RETURN_FALSE; | |
} | |
needle_char[1] = 0; | |
- | |
- found = php_memnstr(haystack, needle_char, 1, haystack + haystack_len); | |
+ found = php_memnstr(haystack, | |
+ needle_char, 1, | |
+ haystack + haystack_len); | |
} | |
if (found) { | |
@@ -1724,6 +1834,7 @@ | |
/* {{{ proto int strpos(string haystack, string needle [, int offset]) | |
Finds position of first occurrence of a string within another */ | |
+ | |
PHP_FUNCTION(strpos) | |
{ | |
zval *needle; | |
@@ -1732,7 +1843,7 @@ | |
char needle_char[2]; | |
long offset = 0; | |
int haystack_len; | |
- | |
+ | |
if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "sz|l", &haystack, &haystack_len, &needle, &offset) == FAILURE) { | |
return; | |
} | |
@@ -1748,20 +1859,26 @@ | |
RETURN_FALSE; | |
} | |
- found = php_memnstr(haystack + offset, | |
+ if (haystack_len >= FASTSEARCH_MIN) { | |
+ found = php_fastsearch(haystack + offset, | |
Z_STRVAL_P(needle), | |
Z_STRLEN_P(needle), | |
haystack + haystack_len); | |
+ } else { | |
+ found = php_memnstr(haystack + offset, | |
+ Z_STRVAL_P(needle), | |
+ Z_STRLEN_P(needle), | |
+ haystack + haystack_len); | |
+ } | |
} else { | |
if (php_needle_char(needle, needle_char TSRMLS_CC) != SUCCESS) { | |
RETURN_FALSE; | |
} | |
needle_char[1] = 0; | |
- | |
found = php_memnstr(haystack + offset, | |
needle_char, | |
1, | |
- haystack + haystack_len); | |
+ haystack + haystack_len); | |
} | |
if (found) { | |
@@ -1772,6 +1889,7 @@ | |
} | |
/* }}} */ | |
+ | |
/* {{{ proto int stripos(string haystack, string needle [, int offset]) | |
Finds position of first occurrence of a string within another, case insensitive */ | |
PHP_FUNCTION(stripos) | |
@@ -1808,7 +1926,12 @@ | |
needle_dup = estrndup(Z_STRVAL_P(needle), Z_STRLEN_P(needle)); | |
php_strtolower(needle_dup, Z_STRLEN_P(needle)); | |
- found = php_memnstr(haystack_dup + offset, needle_dup, Z_STRLEN_P(needle), haystack_dup + haystack_len); | |
+ | |
+ if (haystack_len >= FASTSEARCH_MIN) { | |
+ found = php_fastsearch(haystack_dup + offset, needle_dup, Z_STRLEN_P(needle), haystack_dup + haystack_len); | |
+ } else { | |
+ found = php_memnstr(haystack_dup + offset, needle_dup, Z_STRLEN_P(needle), haystack_dup + haystack_len); | |
+ } | |
} else { | |
if (php_needle_char(needle, needle_char TSRMLS_CC) != SUCCESS) { | |
efree(haystack_dup); | |
@@ -1816,9 +1939,10 @@ | |
} | |
needle_char[0] = tolower(needle_char[0]); | |
needle_char[1] = '\0'; | |
- found = php_memnstr(haystack_dup + offset, | |
- needle_char, | |
- sizeof(needle_char) - 1, | |
+ | |
+ found = php_memnstr(haystack_dup + offset, | |
+ needle_char, | |
+ sizeof(needle_char) - 1, | |
haystack_dup + haystack_len); | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment