Created
May 4, 2017 18:04
-
-
Save schwittlick/17179f5e7b45ed1c8d64cfe06df1e84a to your computer and use it in GitHub Desktop.
tgo structure sketch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
scrapers, which save raw, unpreprocessed text files: | |
- gdoors/src/python/scrapers/4chan.py | |
- gdoors/src/python/scrapers/reddit.py | |
preprocessing, which gets a text file as a parameter and filters out non-English and blacklisted vocabulary | |
- gdoors/src/python/preprocessing/filter.py | |
generators/w2v model training | |
- gdoors/src/python/generators/w2v.py | |
- gdoors/src/python/generators/d2v.py | |
- gdoors/src/python/generators/generator.py | |
unfinished/experiments | |
- gdoors/src/python/experiments/shannon_entropy_test.py | |
- gdoors/src/python/experiments/different_entropy_measure_test.py | |
- gdoors/src/python/experiments/reddit_subreddits_streaming_testing.py | |
- gdoors/src/python/experiments/cmu_play.py | |
tests | |
- gdoors/src/python/test/language_filter_test.py | |
- gdoors/src/python/test/entropy_comparison_test.py | |
## Scrape 4chan | |
cd src/python/scrapers/ | |
workon gdoors_scape | |
pip install -r requirements.txt | |
# adjust 4chan_settings.ini | |
python 4chan.py --output_file <gdoors_repo>/data/4chan_raw_scape_yyyy_mm_dd.txt # immediately scrapes whatever is on 4chan at the moment | |
## Scrape reddit | |
cd src/python/scrapers/ | |
workon gdoors_scape | |
pip install -r requirements.txt | |
# adjust reddit_settings.ini | |
python reddit.py --output_file <gdoors_repo>/data/reddit_raw_scrape_yyyy_mm_dd.txt # sits all day and continuously scrapes | |
## Filter/Preprocess raw text | |
cd src/python/preprocessing/ | |
workon gdoors_filter | |
pip install -r requirements.txt | |
# adjust filter_settings.ini | |
python filter.py --input_file <gdoors_repo>/data/4chan_raw_scape_yyyy_mm_dd.txt --output_file <gdoors_repo>/data/4chan_filtered_yyyy_mm_dd.txt | |
## w2v/d2v training | |
cd src/python/w2v/ | |
workon gdoors_w2v | |
pip install -r requirements.txt | |
# adjust w2v_settings.ini | |
python train_w2v.py --input_file <gdoors_repo>/data/4chan_filtered_yyyy_mm_dd.txt --output_file <gdoors_repo>/data/4chan_w2v_model_yyyy_mm_dd.w2v | |
## Text generation | |
cd src/python/gdoors/generation/ | |
workon gdoors_generation | |
pip install -r requirements.txt | |
# adjust generation_settings.ini | |
python generate --w2v_model <gdoors_repo>/data/4chan_w2v_model_yyyy_mm_dd.w2v --output_file <gdoors_repo>/data/4chan_finished_yyyy_mm_dd-comments.txt |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment