Last active
November 21, 2015 10:22
-
-
Save SaitoAtsushi/89dd96dbb49d0bc565fa to your computer and use it in GitHub Desktop.
word2vec を MinGW32 でビルドするためのパッチ
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Index: word2phrase.c | |
=================================================================== | |
--- word2phrase.c (リビジョン 42) | |
+++ word2phrase.c (作業コピー) | |
@@ -16,8 +16,9 @@ | |
#include <stdlib.h> | |
#include <string.h> | |
#include <math.h> | |
-#include <pthread.h> | |
+#include <inttypes.h> | |
+#define PRIdLLD "%"PRId64 | |
#define MAX_STRING 60 | |
const int vocab_hash_size = 500000000; // Maximum 500M entries in the vocabulary | |
@@ -176,7 +177,7 @@ | |
} else start = 0; | |
train_words++; | |
if ((debug_mode > 1) && (train_words % 100000 == 0)) { | |
- printf("Words processed: %lldK Vocab size: %lldK %c", train_words / 1000, vocab_size / 1000, 13); | |
+ printf("Words processed: " PRIdLLD "K Vocab size: " PRIdLLD "K %c", train_words / 1000, vocab_size / 1000, 13); | |
fflush(stdout); | |
} | |
i = SearchVocab(word); | |
@@ -197,8 +198,8 @@ | |
} | |
SortVocab(); | |
if (debug_mode > 0) { | |
- printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size); | |
- printf("Words in train file: %lld\n", train_words); | |
+ printf("\nVocab size (unigrams + bigrams): " PRIdLLD "\n", vocab_size); | |
+ printf("Words in train file: " PRIdLLD "\n", train_words); | |
} | |
fclose(fin); | |
} | |
@@ -223,7 +224,7 @@ | |
} | |
cn++; | |
if ((debug_mode > 1) && (cn % 100000 == 0)) { | |
- printf("Words written: %lldK%c", cn / 1000, 13); | |
+ printf("Words written: " PRIdLLD "K%c", cn / 1000, 13); | |
fflush(stdout); | |
} | |
oov = 0; | |
Index: word-analogy.c | |
=================================================================== | |
--- word-analogy.c (リビジョン 42) | |
+++ word-analogy.c (作業コピー) | |
@@ -16,7 +16,10 @@ | |
#include <string.h> | |
#include <math.h> | |
#include <malloc.h> | |
+#include <inttypes.h> | |
+#define PRIdLLD "%"PRId64 | |
+ | |
const long long max_size = 2000; // max length of strings | |
const long long N = 40; // number of closest words that will be shown | |
const long long max_w = 50; // max length of vocabulary entries | |
@@ -28,7 +31,6 @@ | |
char file_name[max_size], st[100][max_size]; | |
float dist, len, bestd[N], vec[max_size]; | |
long long words, size, a, b, c, d, cn, bi[100]; | |
- char ch; | |
float *M; | |
char *vocab; | |
if (argc < 2) { | |
@@ -41,12 +43,12 @@ | |
printf("Input file not found\n"); | |
return -1; | |
} | |
- fscanf(f, "%lld", &words); | |
- fscanf(f, "%lld", &size); | |
+ fscanf(f, PRIdLLD, &words); | |
+ fscanf(f, PRIdLLD, &size); | |
vocab = (char *)malloc((long long)words * max_w * sizeof(char)); | |
M = (float *)malloc((long long)words * (long long)size * sizeof(float)); | |
if (M == NULL) { | |
- printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); | |
+ printf("Cannot allocate memory: " PRIdLLD " MB " PRIdLLD " " PRIdLLD "\n", (long long)words * size * sizeof(float) / 1048576, words, size); | |
return -1; | |
} | |
for (b = 0; b < words; b++) { | |
@@ -68,6 +70,7 @@ | |
for (a = 0; a < N; a++) bestd[a] = 0; | |
for (a = 0; a < N; a++) bestw[a][0] = 0; | |
printf("Enter three words (EXIT to break): "); | |
+ fflush(stdout); | |
a = 0; | |
while (1) { | |
st1[a] = fgetc(stdin); | |
@@ -95,14 +98,14 @@ | |
} | |
cn++; | |
if (cn < 3) { | |
- printf("Only %lld words were entered.. three words are needed at the input to perform the calculation\n", cn); | |
+ printf("Only " PRIdLLD " words were entered.. three words are needed at the input to perform the calculation\n", cn); | |
continue; | |
} | |
for (a = 0; a < cn; a++) { | |
for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; | |
if (b == words) b = 0; | |
bi[a] = b; | |
- printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); | |
+ printf("\nWord: %s Position in vocabulary: " PRIdLLD "\n", st[a], bi[a]); | |
if (b == 0) { | |
printf("Out of dictionary word!\n"); | |
break; | |
Index: compute-accuracy.c | |
=================================================================== | |
--- compute-accuracy.c (リビジョン 42) | |
+++ compute-accuracy.c (作業コピー) | |
@@ -18,7 +18,10 @@ | |
#include <math.h> | |
#include <malloc.h> | |
#include <ctype.h> | |
+#include <inttypes.h> | |
+#define PRIdLLD "%"PRId64 | |
+ | |
const long long max_size = 2000; // max length of strings | |
const long long N = 1; // number of closest words | |
const long long max_w = 50; // max length of vocabulary entries | |
@@ -26,7 +29,7 @@ | |
int main(int argc, char **argv) | |
{ | |
FILE *f; | |
- char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch; | |
+ char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size]; | |
float dist, len, bestd[N], vec[max_size]; | |
long long words, size, a, b, c, d, b1, b2, b3, threshold = 0; | |
float *M; | |
@@ -43,13 +46,13 @@ | |
printf("Input file not found\n"); | |
return -1; | |
} | |
- fscanf(f, "%lld", &words); | |
+ fscanf(f, PRIdLLD, &words); | |
if (threshold) if (words > threshold) words = threshold; | |
- fscanf(f, "%lld", &size); | |
+ fscanf(f, PRIdLLD, &size); | |
vocab = (char *)malloc(words * max_w * sizeof(char)); | |
M = (float *)malloc(words * size * sizeof(float)); | |
if (M == NULL) { | |
- printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576); | |
+ printf("Cannot allocate memory: " PRIdLLD " MB\n", words * size * sizeof(float) / 1048576); | |
return -1; | |
} | |
for (b = 0; b < words; b++) { | |
Index: distance.c | |
=================================================================== | |
--- distance.c (リビジョン 42) | |
+++ distance.c (作業コピー) | |
@@ -16,7 +16,10 @@ | |
#include <string.h> | |
#include <math.h> | |
#include <malloc.h> | |
+#include <inttypes.h> | |
+#define PRIdLLD "%"PRId64 | |
+ | |
const long long max_size = 2000; // max length of strings | |
const long long N = 40; // number of closest words that will be shown | |
const long long max_w = 50; // max length of vocabulary entries | |
@@ -28,7 +31,6 @@ | |
char file_name[max_size], st[100][max_size]; | |
float dist, len, bestd[N], vec[max_size]; | |
long long words, size, a, b, c, d, cn, bi[100]; | |
- char ch; | |
float *M; | |
char *vocab; | |
if (argc < 2) { | |
@@ -41,13 +43,13 @@ | |
printf("Input file not found\n"); | |
return -1; | |
} | |
- fscanf(f, "%lld", &words); | |
- fscanf(f, "%lld", &size); | |
+ fscanf(f, PRIdLLD, &words); | |
+ fscanf(f, PRIdLLD, &size); | |
vocab = (char *)malloc((long long)words * max_w * sizeof(char)); | |
for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); | |
M = (float *)malloc((long long)words * (long long)size * sizeof(float)); | |
if (M == NULL) { | |
- printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); | |
+ printf("Cannot allocate memory: " PRIdLLD " MB " PRIdLLD " " PRIdLLD "\n", (long long)words * size * sizeof(float) / 1048576, words, size); | |
return -1; | |
} | |
for (b = 0; b < words; b++) { | |
@@ -69,6 +71,7 @@ | |
for (a = 0; a < N; a++) bestd[a] = 0; | |
for (a = 0; a < N; a++) bestw[a][0] = 0; | |
printf("Enter word or sentence (EXIT to break): "); | |
+ fflush(stdout); | |
a = 0; | |
while (1) { | |
st1[a] = fgetc(stdin); | |
@@ -99,7 +102,7 @@ | |
for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; | |
if (b == words) b = -1; | |
bi[a] = b; | |
- printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]); | |
+ printf("\nWord: %s Position in vocabulary: " PRIdLLD "\n", st[a], bi[a]); | |
if (b == -1) { | |
printf("Out of dictionary word!\n"); | |
break; | |
Index: word2vec.c | |
=================================================================== | |
--- word2vec.c (リビジョン 42) | |
+++ word2vec.c (作業コピー) | |
@@ -16,7 +16,11 @@ | |
#include <stdlib.h> | |
#include <string.h> | |
#include <math.h> | |
-#include <pthread.h> | |
+#include <time.h> | |
+#include <inttypes.h> | |
+#include <windows.h> | |
+#include <process.h> | |
+#include <malloc.h> | |
#define MAX_STRING 100 | |
#define EXP_TABLE_SIZE 1000 | |
@@ -24,6 +28,8 @@ | |
#define MAX_SENTENCE_LENGTH 1000 | |
#define MAX_CODE_LENGTH 40 | |
+#define PRIdLLD "%"PRId64 | |
+ | |
const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary | |
typedef float real; // Precision of float numbers | |
@@ -317,13 +323,13 @@ | |
ReadWord(word, fin); | |
if (feof(fin)) break; | |
a = AddWordToVocab(word); | |
- fscanf(fin, "%lld%c", &vocab[a].cn, &c); | |
+ fscanf(fin, PRIdLLD"%c", &vocab[a].cn, &c); | |
i++; | |
} | |
SortVocab(); | |
if (debug_mode > 0) { | |
- printf("Vocab size: %lld\n", vocab_size); | |
- printf("Words in train file: %lld\n", train_words); | |
+ printf("Vocab size: " PRIdLLD "\n", vocab_size); | |
+ printf("Words in train file: " PRIdLLD "\n", train_words); | |
} | |
fin = fopen(train_file, "rb"); | |
if (fin == NULL) { | |
@@ -338,16 +344,17 @@ | |
void InitNet() { | |
long long a, b; | |
unsigned long long next_random = 1; | |
- a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real)); | |
+ syn0 = __mingw_aligned_malloc((long long)vocab_size * layer1_size * sizeof(real), 128); | |
+ | |
if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);} | |
if (hs) { | |
- a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real)); | |
+ syn1 = __mingw_aligned_malloc((long long)vocab_size * layer1_size * sizeof(real), 128); | |
if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);} | |
for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) | |
syn1[a * layer1_size + b] = 0; | |
} | |
if (negative>0) { | |
- a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real)); | |
+ syn1neg = __mingw_aligned_malloc((long long)vocab_size * layer1_size * sizeof(real), 128); | |
if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);} | |
for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) | |
syn1neg[a * layer1_size + b] = 0; | |
@@ -359,11 +366,12 @@ | |
CreateBinaryTree(); | |
} | |
-void *TrainModelThread(void *id) { | |
+__stdcall unsigned int TrainModelThread(void *arg) { | |
+ long id = (long) arg; | |
long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0; | |
long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1]; | |
long long l1, l2, c, target, label, local_iter = iter; | |
- unsigned long long next_random = (long long)id; | |
+ unsigned long long next_random = (long long) id; | |
real f, g; | |
clock_t now; | |
real *neu1 = (real *)calloc(layer1_size, sizeof(real)); | |
@@ -538,13 +546,14 @@ | |
fclose(fi); | |
free(neu1); | |
free(neu1e); | |
- pthread_exit(NULL); | |
+ _endthreadex(0); | |
+ return 0; /* unreachable */ | |
} | |
void TrainModel() { | |
long a, b, c, d; | |
FILE *fo; | |
- pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t)); | |
+ HANDLE *pt = malloc(num_threads * sizeof(HANDLE)); | |
printf("Starting training using file %s\n", train_file); | |
starting_alpha = alpha; | |
if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile(); | |
@@ -553,12 +562,14 @@ | |
InitNet(); | |
if (negative > 0) InitUnigramTable(); | |
start = clock(); | |
- for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a); | |
- for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL); | |
+ for (a = 0; a < num_threads; a++) | |
+ pt[a] = (HANDLE) _beginthreadex(NULL, 0, TrainModelThread, (void *)a, 0, NULL); | |
+ for (a = 0; a < num_threads; a++) WaitForSingleObject(pt[a], INFINITE); /* join one by one: WaitForMultipleObjects fails above MAXIMUM_WAIT_OBJECTS (64) handles */ | |
+ for (a = 0; a < num_threads; a++) CloseHandle(pt[a]); | |
fo = fopen(output_file, "wb"); | |
if (classes == 0) { | |
// Save the word vectors | |
- fprintf(fo, "%lld %lld\n", vocab_size, layer1_size); | |
+ fprintf(fo, PRIdLLD " " PRIdLLD "\n", vocab_size, layer1_size); | |
for (a = 0; a < vocab_size; a++) { | |
fprintf(fo, "%s ", vocab[a].word); | |
if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo); | |
Index: makefile | |
=================================================================== | |
--- makefile (リビジョン 42) | |
+++ makefile (作業コピー) | |
@@ -1,6 +1,7 @@ | |
CC = gcc | |
#Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions | |
-CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result | |
+CFLAGS = -lm -std=c99 -O3 -march=native -Wall -funroll-loops -Wno-unused-result | |
+EXT = .exe | |
all: word2vec word2phrase distance word-analogy compute-accuracy | |
@@ -17,4 +18,4 @@ | |
chmod +x *.sh | |
clean: | |
- rm -rf word2vec word2phrase distance word-analogy compute-accuracy | |
\ No newline at end of file | |
+ rm -rf word2vec$(EXT) word2phrase$(EXT) distance$(EXT) word-analogy$(EXT) compute-accuracy$(EXT) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment