dberlin · March 30, 2014 04:37
diff --git a/sse42.patch b/sse42.patch
 diff --git a/lib/Basic/SourceManager.cpp b/lib/Basic/SourceManager.cpp
 index b78e9f5..f6aa264 100644
 --- a/lib/Basic/SourceManager.cpp
 +++ b/lib/Basic/SourceManager.cpp
 @@ -1200,8 +1200,11 @@ unsigned SourceManager::getPresumedColumnNumber(SourceLocation Loc,
   if (isInvalid(Loc, Invalid)) return 0;
   return getPresumedLoc(Loc).getColumn();
 }
 -
 -#ifdef __SSE2__
 +#ifdef __SSE4_2__
 +#include <nmmintrin.h>
 +#elif __AVX2__
 +#include <immintrin.h>
 +#elif __SSE2__
 #include <emmintrin.h>
 #endif
 
 @@ -1232,7 +1235,63 @@ static void ComputeLineNumbers(DiagnosticsEngine &Diag, ContentCache *FI,
     // Skip over the contents of the line.
     const unsigned char *NextBuf = (const unsigned char *)Buf;
 
 -#ifdef __SSE2__
 +#ifdef __SSE4_2__
 +    __m128i CRLF = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,'\r','\n');
 +    // First fix up the alignment to 16 bytes.
 +    while (((uintptr_t)NextBuf & (0xf)) != 0) {
 +      if (*NextBuf == '\n' || *NextBuf == '\r' || *NextBuf == '\0')
 +        goto FoundSpecialChar;
 +      ++NextBuf;
 +    }
 +    // Scan 16 byte chunks for '\r' and '\n'. Ignore '\0'.
 +    while (NextBuf+16 <= End) {
 +      const __m128i Chunk = *(const __m128i*)NextBuf;
 +      int index = _mm_cmpestri(CRLF, 2, Chunk, 16,
 +			       _SIDD_UBYTE_OPS| 
 +			       _SIDD_CMP_EQUAL_ANY|
 +			       _SIDD_MASKED_POSITIVE_POLARITY| 
 +			       _SIDD_LEAST_SIGNIFICANT);
 +      
 +      // If we found a newline, adjust the pointer and jump to the
 +      // handling code.
 +      if (index != 16) {
 +	NextBuf += index;
 +	goto FoundSpecialChar;
 +      }
 +      NextBuf += 16;
 +    }
 +#elif __AVX2__
 +    // Try to skip to the next newline using AVX2 instructions. This is very
 +    // performance sensitive for programs with lots of diagnostics and in -E
 +    // mode.
 +    __m128i CRs128 = _mm_set1_epi8('\r');
 +    __m256i CRs = _mm256_broadcastb_epi8(CRs128);
 +    __m128i LFs128 = _mm_set1_epi8('\n');
 +    __m256i LFs = _mm256_broadcastb_epi8(LFs128);
 +
 +    // First fix up the alignment to 32 bytes.
 +    while (((uintptr_t)NextBuf & (0x1f)) != 0) {
 +      if (*NextBuf == '\n' || *NextBuf == '\r' || *NextBuf == '\0')
 +        goto FoundSpecialChar;
 +      ++NextBuf;
 +    }
 +
 +    // Scan 32 byte chunks for '\r' and '\n'. Ignore '\0'.
 +    while (NextBuf+32 <= End) {
 +      const __m256i Chunk = *(const __m256i*)NextBuf;
 +      __m256i Cmp = _mm256_or_si256(_mm256_cmpeq_epi8(Chunk, CRs),
 +                                 _mm256_cmpeq_epi8(Chunk, LFs));
 +      unsigned Mask = _mm256_movemask_epi8(Cmp);
 +
 +      // If we found a newline, adjust the pointer and jump to the handling code.
 +      if (Mask != 0) {
 +        NextBuf += llvm::countTrailingZeros(Mask);
 +        goto FoundSpecialChar;
 +      }
 +      NextBuf += 32;
 +    }
 +    
 +#elif __SSE2__
     // Try to skip to the next newline using SSE instructions. This is very
     // performance sensitive for programs with lots of diagnostics and in -E
     // mode.
 @@ -1265,7 +1324,7 @@ static void ComputeLineNumbers(DiagnosticsEngine &Diag, ContentCache *FI,
     while (*NextBuf != '\n' && *NextBuf != '\r' && *NextBuf != '\0')
       ++NextBuf;
 
 -#ifdef __SSE2__
 +#if defined(__SSE2__) || defined(__AVX2__)
 FoundSpecialChar:
 #endif
     Offs += NextBuf-Buf;
 diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
 index 0955cc5..10805bb 100644
 --- a/lib/Lex/Lexer.cpp
 +++ b/lib/Lex/Lexer.cpp
 @@ -2249,7 +2249,9 @@ static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
   return true;
 }
 
 -#ifdef __SSE2__
 +#ifdef __AVX2__
 +#include <immintrin.h>
 +#elif __SSE2__
 #include <emmintrin.h>
 #elif __ALTIVEC__
 #include <altivec.h>
 @@ -2306,13 +2308,32 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
         // If there is a code-completion point avoid the fast scan because it
         // doesn't check for '\0'.
         !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
 -      // While not aligned to a 16-byte boundary.
 -      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
 +
 +#ifdef __AVX2__
 +      const int alignment = 32;
 +#else
 +      const int alignment = 16;
 +#endif
 +      // While not properly aligned to the alignment we need
 +      while (C != '/' && ((intptr_t)CurPtr & (alignment-1)) != 0)
         C = *CurPtr++;
 
       if (C == '/') goto FoundSlash;
 -
 -#ifdef __SSE2__
 +#ifdef __AVX2__
 +      __m128i Slashes128 = _mm_set1_epi8('/');
 +      __m256i Slashes = _mm256_broadcastb_epi8(Slashes128);
 +      while (CurPtr+32 <= BufferEnd) {
 +	int cmp = _mm256_movemask_epi8(_mm256_cmpeq_epi8(*(const __m256i*)CurPtr, Slashes));
 +	 if (cmp != 0) {
 +          // Adjust the pointer to point directly after the first slash. It's
 +          // not necessary to set C here, it will be overwritten at the end of
 +          // the outer loop.
 +          CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
 +          goto FoundSlash;
 +        }
 +        CurPtr += 32;
 +      }      
 +#elif __SSE2__
       __m128i Slashes = _mm_set1_epi8('/');
       while (CurPtr+16 <= BufferEnd) {
         int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
	diff --git a/lib/Basic/SourceManager.cpp b/lib/Basic/SourceManager.cpp
	index b78e9f5..f6aa264 100644
	--- a/lib/Basic/SourceManager.cpp
	+++ b/lib/Basic/SourceManager.cpp
	@@ -1200,8 +1200,11 @@ unsigned SourceManager::getPresumedColumnNumber(SourceLocation Loc,
	if (isInvalid(Loc, Invalid)) return 0;
	return getPresumedLoc(Loc).getColumn();
	}
	-
	-#ifdef __SSE2__
	+#ifdef __SSE4_2__
	+#include <nmmintrin.h>
	+#elif __AVX2__
	+#include <immintrin.h>
	+#elif __SSE2__
	#include <emmintrin.h>
	#endif

	@@ -1232,7 +1235,63 @@ static void ComputeLineNumbers(DiagnosticsEngine &Diag, ContentCache *FI,
	// Skip over the contents of the line.
	const unsigned char *NextBuf = (const unsigned char *)Buf;

	-#ifdef __SSE2__
	+#ifdef __SSE4_2__
	+ __m128i CRLF = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0,0,0,'\r','\n');
	+ // First fix up the alignment to 16 bytes.
	+ while (((uintptr_t)NextBuf & (0xf)) != 0) {
	+ if (*NextBuf == '\n' || *NextBuf == '\r' || *NextBuf == '\0')
	+ goto FoundSpecialChar;
	+ ++NextBuf;
	+ }
	+ // Scan 16 byte chunks for '\r' and '\n'. Ignore '\0'.
	+ while (NextBuf+16 <= End) {
	+ const __m128i Chunk = *(const __m128i*)NextBuf;
	+ int index = _mm_cmpestri(CRLF, 2, Chunk, 16,
	+ _SIDD_UBYTE_OPS|
	+ _SIDD_CMP_EQUAL_ANY|
	+ _SIDD_MASKED_POSITIVE_POLARITY|
	+ _SIDD_LEAST_SIGNIFICANT);
	+
	+ // If we found a newline, adjust the pointer and jump to the
	+ // handling code.
	+ if (index != 16) {
	+ NextBuf += index;
	+ goto FoundSpecialChar;
	+ }
	+ NextBuf += 16;
	+ }
	+#elif __AVX2__
	+ // Try to skip to the next newline using AVX2 instructions. This is very
	+ // performance sensitive for programs with lots of diagnostics and in -E
	+ // mode.
	+ __m128i CRs128 = _mm_set1_epi8('\r');
	+ __m256i CRs = _mm256_broadcastb_epi8(CRs128);
	+ __m128i LFs128 = _mm_set1_epi8('\n');
	+ __m256i LFs = _mm256_broadcastb_epi8(LFs128);
	+
	+ // First fix up the alignment to 32 bytes.
	+ while (((uintptr_t)NextBuf & (0x1f)) != 0) {
	+ if (*NextBuf == '\n' || *NextBuf == '\r' || *NextBuf == '\0')
	+ goto FoundSpecialChar;
	+ ++NextBuf;
	+ }
	+
	+ // Scan 32 byte chunks for '\r' and '\n'. Ignore '\0'.
	+ while (NextBuf+32 <= End) {
	+ const __m256i Chunk = *(const __m256i*)NextBuf;
	+ __m256i Cmp = _mm256_or_si256(_mm256_cmpeq_epi8(Chunk, CRs),
	+ _mm256_cmpeq_epi8(Chunk, LFs));
	+ unsigned Mask = _mm256_movemask_epi8(Cmp);
	+
	+ // If we found a newline, adjust the pointer and jump to the handling code.
	+ if (Mask != 0) {
	+ NextBuf += llvm::countTrailingZeros(Mask);
	+ goto FoundSpecialChar;
	+ }
	+ NextBuf += 32;
	+ }
	+
	+#elif __SSE2__
	// Try to skip to the next newline using SSE instructions. This is very
	// performance sensitive for programs with lots of diagnostics and in -E
	// mode.
	@@ -1265,7 +1324,7 @@ static void ComputeLineNumbers(DiagnosticsEngine &Diag, ContentCache *FI,
	while (*NextBuf != '\n' && *NextBuf != '\r' && *NextBuf != '\0')
	++NextBuf;

	-#ifdef __SSE2__
	+#if defined(__SSE2__) || defined(__AVX2__)
	FoundSpecialChar:
	#endif
	Offs += NextBuf-Buf;
	diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
	index 0955cc5..10805bb 100644
	--- a/lib/Lex/Lexer.cpp
	+++ b/lib/Lex/Lexer.cpp
	@@ -2249,7 +2249,9 @@ static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
	return true;
	}

	-#ifdef __SSE2__
	+#ifdef __AVX2__
	+#include <immintrin.h>
	+#elif __SSE2__
	#include <emmintrin.h>
	#elif __ALTIVEC__
	#include <altivec.h>
	@@ -2306,13 +2308,32 @@ bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
	// If there is a code-completion point avoid the fast scan because it
	// doesn't check for '\0'.
	!(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
	- // While not aligned to a 16-byte boundary.
	- while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
	+
	+#ifdef __AVX2__
	+ const int alignment = 32;
	+#else
	+ const int alignment = 16;
	+#endif
	+ // While not properly aligned to the alignment we need
	+ while (C != '/' && ((intptr_t)CurPtr & (alignment-1)) != 0)
	C = *CurPtr++;

	if (C == '/') goto FoundSlash;
	-
	-#ifdef __SSE2__
	+#ifdef __AVX2__
	+ __m128i Slashes128 = _mm_set1_epi8('/');
	+ __m256i Slashes = _mm256_broadcastb_epi8(Slashes128);
	+ while (CurPtr+32 <= BufferEnd) {
	+ int cmp = _mm256_movemask_epi8(_mm256_cmpeq_epi8(*(const __m256i*)CurPtr, Slashes));
	+ if (cmp != 0) {
	+ // Adjust the pointer to point directly after the first slash. It's
	+ // not necessary to set C here, it will be overwritten at the end of
	+ // the outer loop.
	+ CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
	+ goto FoundSlash;
	+ }
	+ CurPtr += 32;
	+ }
	+#elif __SSE2__
	__m128i Slashes = _mm_set1_epi8('/');
	while (CurPtr+16 <= BufferEnd) {
	int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,