Skip to content

Instantly share code, notes, and snippets.

@SaitoAtsushi
Last active November 21, 2015 10:22
Show Gist options
  • Save SaitoAtsushi/89dd96dbb49d0bc565fa to your computer and use it in GitHub Desktop.
Save SaitoAtsushi/89dd96dbb49d0bc565fa to your computer and use it in GitHub Desktop.
word2vec を MinGW32 でビルドするためのパッチ
Index: word2phrase.c
===================================================================
--- word2phrase.c (リビジョン 42)
+++ word2phrase.c (作業コピー)
@@ -16,8 +16,9 @@
#include <stdlib.h>
#include <string.h>
#include <math.h>
-#include <pthread.h>
+#include <inttypes.h>
+#define PRIdLLD "%"PRId64
#define MAX_STRING 60
const int vocab_hash_size = 500000000; // Maximum 500M entries in the vocabulary
@@ -176,7 +177,7 @@
} else start = 0;
train_words++;
if ((debug_mode > 1) && (train_words % 100000 == 0)) {
- printf("Words processed: %lldK Vocab size: %lldK %c", train_words / 1000, vocab_size / 1000, 13);
+ printf("Words processed: " PRIdLLD "K Vocab size: " PRIdLLD "K %c", train_words / 1000, vocab_size / 1000, 13);
fflush(stdout);
}
i = SearchVocab(word);
@@ -197,8 +198,8 @@
}
SortVocab();
if (debug_mode > 0) {
- printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size);
- printf("Words in train file: %lld\n", train_words);
+ printf("\nVocab size (unigrams + bigrams): " PRIdLLD "\n", vocab_size);
+ printf("Words in train file: " PRIdLLD "\n", train_words);
}
fclose(fin);
}
@@ -223,7 +224,7 @@
}
cn++;
if ((debug_mode > 1) && (cn % 100000 == 0)) {
- printf("Words written: %lldK%c", cn / 1000, 13);
+ printf("Words written: " PRIdLLD "K%c", cn / 1000, 13);
fflush(stdout);
}
oov = 0;
Index: word-analogy.c
===================================================================
--- word-analogy.c (リビジョン 42)
+++ word-analogy.c (作業コピー)
@@ -16,7 +16,10 @@
#include <string.h>
#include <math.h>
#include <malloc.h>
+#include <inttypes.h>
+#define PRIdLLD "%"PRId64
+
const long long max_size = 2000; // max length of strings
const long long N = 40; // number of closest words that will be shown
const long long max_w = 50; // max length of vocabulary entries
@@ -28,7 +31,6 @@
char file_name[max_size], st[100][max_size];
float dist, len, bestd[N], vec[max_size];
long long words, size, a, b, c, d, cn, bi[100];
- char ch;
float *M;
char *vocab;
if (argc < 2) {
@@ -41,12 +43,12 @@
printf("Input file not found\n");
return -1;
}
- fscanf(f, "%lld", &words);
- fscanf(f, "%lld", &size);
+ fscanf(f, "%" SCNd64, &words);
+ fscanf(f, "%" SCNd64, &size);
vocab = (char *)malloc((long long)words * max_w * sizeof(char));
M = (float *)malloc((long long)words * (long long)size * sizeof(float));
if (M == NULL) {
- printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
+ printf("Cannot allocate memory: " PRIdLLD " MB " PRIdLLD " " PRIdLLD "\n", (long long)words * size * sizeof(float) / 1048576, words, size);
return -1;
}
for (b = 0; b < words; b++) {
@@ -68,6 +70,7 @@
for (a = 0; a < N; a++) bestd[a] = 0;
for (a = 0; a < N; a++) bestw[a][0] = 0;
printf("Enter three words (EXIT to break): ");
+ fflush(stdout);
a = 0;
while (1) {
st1[a] = fgetc(stdin);
@@ -95,14 +98,14 @@
}
cn++;
if (cn < 3) {
- printf("Only %lld words were entered.. three words are needed at the input to perform the calculation\n", cn);
+ printf("Only " PRIdLLD " words were entered.. three words are needed at the input to perform the calculation\n", cn);
continue;
}
for (a = 0; a < cn; a++) {
for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
if (b == words) b = 0;
bi[a] = b;
- printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]);
+ printf("\nWord: %s Position in vocabulary: " PRIdLLD "\n", st[a], bi[a]);
if (b == 0) {
printf("Out of dictionary word!\n");
break;
Index: compute-accuracy.c
===================================================================
--- compute-accuracy.c (リビジョン 42)
+++ compute-accuracy.c (作業コピー)
@@ -18,7 +18,10 @@
#include <math.h>
#include <malloc.h>
#include <ctype.h>
+#include <inttypes.h>
+#define PRIdLLD "%"PRId64
+
const long long max_size = 2000; // max length of strings
const long long N = 1; // number of closest words
const long long max_w = 50; // max length of vocabulary entries
@@ -26,7 +29,7 @@
int main(int argc, char **argv)
{
FILE *f;
- char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size], ch;
+ char st1[max_size], st2[max_size], st3[max_size], st4[max_size], bestw[N][max_size], file_name[max_size];
float dist, len, bestd[N], vec[max_size];
long long words, size, a, b, c, d, b1, b2, b3, threshold = 0;
float *M;
@@ -43,13 +46,13 @@
printf("Input file not found\n");
return -1;
}
- fscanf(f, "%lld", &words);
+ fscanf(f, "%" SCNd64, &words);
if (threshold) if (words > threshold) words = threshold;
- fscanf(f, "%lld", &size);
+ fscanf(f, "%" SCNd64, &size);
vocab = (char *)malloc(words * max_w * sizeof(char));
M = (float *)malloc(words * size * sizeof(float));
if (M == NULL) {
- printf("Cannot allocate memory: %lld MB\n", words * size * sizeof(float) / 1048576);
+ printf("Cannot allocate memory: " PRIdLLD " MB\n", words * size * sizeof(float) / 1048576);
return -1;
}
for (b = 0; b < words; b++) {
Index: distance.c
===================================================================
--- distance.c (リビジョン 42)
+++ distance.c (作業コピー)
@@ -16,7 +16,10 @@
#include <string.h>
#include <math.h>
#include <malloc.h>
+#include <inttypes.h>
+#define PRIdLLD "%"PRId64
+
const long long max_size = 2000; // max length of strings
const long long N = 40; // number of closest words that will be shown
const long long max_w = 50; // max length of vocabulary entries
@@ -28,7 +31,6 @@
char file_name[max_size], st[100][max_size];
float dist, len, bestd[N], vec[max_size];
long long words, size, a, b, c, d, cn, bi[100];
- char ch;
float *M;
char *vocab;
if (argc < 2) {
@@ -41,13 +43,13 @@
printf("Input file not found\n");
return -1;
}
- fscanf(f, "%lld", &words);
- fscanf(f, "%lld", &size);
+ fscanf(f, "%" SCNd64, &words);
+ fscanf(f, "%" SCNd64, &size);
vocab = (char *)malloc((long long)words * max_w * sizeof(char));
for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
M = (float *)malloc((long long)words * (long long)size * sizeof(float));
if (M == NULL) {
- printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
+ printf("Cannot allocate memory: " PRIdLLD " MB " PRIdLLD " " PRIdLLD "\n", (long long)words * size * sizeof(float) / 1048576, words, size);
return -1;
}
for (b = 0; b < words; b++) {
@@ -69,6 +71,7 @@
for (a = 0; a < N; a++) bestd[a] = 0;
for (a = 0; a < N; a++) bestw[a][0] = 0;
printf("Enter word or sentence (EXIT to break): ");
+ fflush(stdout);
a = 0;
while (1) {
st1[a] = fgetc(stdin);
@@ -99,7 +102,7 @@
for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
if (b == words) b = -1;
bi[a] = b;
- printf("\nWord: %s Position in vocabulary: %lld\n", st[a], bi[a]);
+ printf("\nWord: %s Position in vocabulary: " PRIdLLD "\n", st[a], bi[a]);
if (b == -1) {
printf("Out of dictionary word!\n");
break;
Index: word2vec.c
===================================================================
--- word2vec.c (リビジョン 42)
+++ word2vec.c (作業コピー)
@@ -16,7 +16,11 @@
#include <stdlib.h>
#include <string.h>
#include <math.h>
-#include <pthread.h>
+#include <time.h>
+#include <inttypes.h>
+#include <windows.h>
+#include <process.h>
+#include <malloc.h>
#define MAX_STRING 100
#define EXP_TABLE_SIZE 1000
@@ -24,6 +28,8 @@
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CODE_LENGTH 40
+#define PRIdLLD "%"PRId64
+
const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
typedef float real; // Precision of float numbers
@@ -317,13 +323,13 @@
ReadWord(word, fin);
if (feof(fin)) break;
a = AddWordToVocab(word);
- fscanf(fin, "%lld%c", &vocab[a].cn, &c);
+ fscanf(fin, "%" SCNd64 "%c", &vocab[a].cn, &c);
i++;
}
SortVocab();
if (debug_mode > 0) {
- printf("Vocab size: %lld\n", vocab_size);
- printf("Words in train file: %lld\n", train_words);
+ printf("Vocab size: " PRIdLLD "\n", vocab_size);
+ printf("Words in train file: " PRIdLLD "\n", train_words);
}
fin = fopen(train_file, "rb");
if (fin == NULL) {
@@ -338,16 +344,17 @@
void InitNet() {
long long a, b;
unsigned long long next_random = 1;
- a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
+ syn0 = __mingw_aligned_malloc((long long)vocab_size * layer1_size * sizeof(real), 128);
+
if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
if (hs) {
- a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
+ syn1 = __mingw_aligned_malloc((long long)vocab_size * layer1_size * sizeof(real), 128);
if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
syn1[a * layer1_size + b] = 0;
}
if (negative>0) {
- a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
+ syn1neg = __mingw_aligned_malloc((long long)vocab_size * layer1_size * sizeof(real), 128);
if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
syn1neg[a * layer1_size + b] = 0;
@@ -359,11 +366,12 @@
CreateBinaryTree();
}
-void *TrainModelThread(void *id) {
+__stdcall unsigned int TrainModelThread(void *arg) {
+ intptr_t id = (intptr_t) arg; /* pointer-sized: long is only 32 bits on 64-bit Windows */
long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
long long l1, l2, c, target, label, local_iter = iter;
- unsigned long long next_random = (long long)id;
+ unsigned long long next_random = (long long) id;
real f, g;
clock_t now;
real *neu1 = (real *)calloc(layer1_size, sizeof(real));
@@ -538,13 +546,14 @@
fclose(fi);
free(neu1);
free(neu1e);
- pthread_exit(NULL);
+ _endthreadex(0);
+ return 0; /* unreachable */
}
void TrainModel() {
long a, b, c, d;
FILE *fo;
- pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
+ HANDLE *pt = malloc(num_threads * sizeof(HANDLE));
printf("Starting training using file %s\n", train_file);
starting_alpha = alpha;
if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
@@ -553,12 +562,14 @@
InitNet();
if (negative > 0) InitUnigramTable();
start = clock();
- for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
- for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
+ for (a = 0; a < num_threads; a++)
+ pt[a] = (HANDLE) _beginthreadex(NULL, 0, TrainModelThread, (void *)a, 0, NULL);
+ for (a = 0; a < num_threads; a++) WaitForSingleObject(pt[a], INFINITE); /* WaitForMultipleObjects is capped at MAXIMUM_WAIT_OBJECTS (64) handles */
+ for (a = 0; a < num_threads; a++) CloseHandle(pt[a]);
fo = fopen(output_file, "wb");
if (classes == 0) {
// Save the word vectors
- fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
+ fprintf(fo, PRIdLLD " " PRIdLLD "\n", vocab_size, layer1_size);
for (a = 0; a < vocab_size; a++) {
fprintf(fo, "%s ", vocab[a].word);
if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
Index: makefile
===================================================================
--- makefile (リビジョン 42)
+++ makefile (作業コピー)
@@ -1,6 +1,7 @@
CC = gcc
#Using -Ofast instead of -O3 might result in faster code, but is supported only by newer GCC versions
-CFLAGS = -lm -pthread -O3 -march=native -Wall -funroll-loops -Wno-unused-result
+CFLAGS = -lm -std=c99 -O3 -march=native -Wall -funroll-loops -Wno-unused-result
+EXT = .exe
all: word2vec word2phrase distance word-analogy compute-accuracy
@@ -17,4 +18,4 @@
chmod +x *.sh
clean:
- rm -rf word2vec word2phrase distance word-analogy compute-accuracy
\ No newline at end of file
+ rm -rf word2vec$(EXT) word2phrase$(EXT) distance$(EXT) word-analogy$(EXT) compute-accuracy$(EXT)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment