Two-step processing for loading the goodbooks-10k dataset into Elasticsearch. Step 1 (the second script below) flattens the dataset's tag tables into a single file, new_.csv; step 2 (the first script) bulk-indexes that CSV into Elasticsearch.
Step 2: bulk-load new_.csv into Elasticsearch.
import csv

from elasticsearch import Elasticsearch

ES_MEDIA_INDEX = 'babelcodex_test'
ES_MEDIA_TYPE = 'media'
ES_MEDIA_ID_FIELD = 'id'

es = Elasticsearch()

# Build the bulk request body: each document is preceded by an
# action/metadata line naming the index, type, and document id.
bulk_data = []
with open('new_.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for item in reader:
        tag_names = item[2].split("|")
        data_dict = {
            'id': item[0],
            'title': item[1],
            'tags': tag_names
        }
        op_dict = {
            "index": {
                "_index": ES_MEDIA_INDEX,
                "_type": ES_MEDIA_TYPE,
                "_id": data_dict[ES_MEDIA_ID_FIELD]
            }
        }
        bulk_data.append(op_dict)
        bulk_data.append(data_dict)

# A single shard with no replicas is enough for a local test index.
request_body = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    }
}
es.indices.create(index=ES_MEDIA_INDEX, body=request_body)

# Index everything in one bulk call; refresh=True makes the documents
# searchable as soon as the call returns.
es.bulk(index=ES_MEDIA_INDEX, body=bulk_data, refresh=True)
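To verify the load, a quick search against the new index can be run. This is a minimal sketch, assuming Elasticsearch is reachable on localhost:9200 with the 6.x-era client that the _type usage above implies; "fantasy" is just an arbitrary example tag, not something from the gist.

from elasticsearch import Elasticsearch

es = Elasticsearch()

# Look up a handful of books carrying the example tag "fantasy".
res = es.search(index='babelcodex_test', body={
    'query': {'match': {'tags': 'fantasy'}},
    'size': 5
})

# Note: in Elasticsearch 6.x res['hits']['total'] is an int; in 7.x it is a dict.
print(res['hits']['total'])
for hit in res['hits']['hits']:
    print(hit['_id'], hit['_source']['title'])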
Step 1: flatten the goodbooks-10k tag tables into new_.csv.
import csv

import pandas as pd
from tqdm import tqdm

# ratings.csv and to_read.csv are part of goodbooks-10k but are not used below.
r = pd.read_csv('ratings.csv')
tr = pd.read_csv('to_read.csv')
b = pd.read_csv('books.csv')
t = pd.read_csv('tags.csv')
bt = pd.read_csv('book_tags.csv')

# Merge tag names into the tag applications.
bt = bt.merge(t, on='tag_id')

# Merge book titles in as well, for good measure.
bt = bt.merge(b[['goodreads_book_id', 'title']], on='goodreads_book_id')

# Fix negative tag counts.
bt.loc[bt['count'] < 0, 'count'] = 0

print("Collecting tags from book_tags.csv")
book_tags = {}
with tqdm(total=len(bt)) as pbar:
    for _, row in bt.iterrows():
        if row['goodreads_book_id'] not in book_tags:
            book_tags[row['goodreads_book_id']] = []
        book_tags[row['goodreads_book_id']].append(row['tag_name'])
        pbar.update(1)

print("Creating new CSV file")
with open('new_.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    with tqdm(total=len(b)) as pbar:
        for _, row in b.iterrows():
            # A book with no tag applications would otherwise raise a KeyError.
            tags = book_tags.get(row['goodreads_book_id'], [])
            tag_string = '|'.join(tags)
            writer.writerow([row['goodreads_book_id'], row['title'], tag_string])
            pbar.update(1)
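As a quick sanity check before running the loader, the first few rows of the generated file can be printed; each row is the book id, the title, and a pipe-joined tag string. A minimal sketch, assuming new_.csv was produced by the script above:

import csv

# Print the first three rows: [goodreads_book_id, title, tag|tag|...]
with open('new_.csv', newline='') as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        if i >= 3:
            break
        print(row)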