Last active
August 29, 2015 14:00
-
-
Save ap-Codkelden/11333479 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import argparse | |
import time | |
import os | |
import sys | |
import codecs | |
import json | |
import glob | |
import re | |
# Juick API endpoint for listing messages by tag; the tag name is appended
# directly after the trailing "tag=".
URL = 'http://api.juick.com/messages?tag='

# One module-level HTTP session object used for every request in this script.
session = requests.Session()
def repair(data):
    """Decode an HTTP response body as UTF-8 JSON, tolerating raw control chars.

    The previous implementation deleted every CR/LF byte from the body before
    parsing. That erased line breaks inside message text (joining words that
    were on separate lines) and still failed on other unescaped control
    characters such as TAB. Parsing with ``strict=False`` accepts unescaped
    control characters inside JSON strings and keeps them in the result.

    Args:
        data: Response-like object exposing the raw body as ``bytes`` in its
            ``content`` attribute.

    Returns:
        The decoded JSON value.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        ValueError: If the body is not parseable as JSON.
    """
    text = data.content.decode('utf-8')
    # strict=False: allow literal newlines/tabs inside string values instead
    # of stripping them from the payload.
    return json.loads(text, strict=False)
def gettag(tag, page):
    """Fetch one page of messages carrying *tag*.

    The tag is percent-encoded before being placed in the query string, so
    tags containing reserved characters (``&``, ``#``, ``+``, ``=``, spaces)
    no longer corrupt the request.

    Args:
        tag: Tag name to query.
        page: Page number, passed as the ``page`` query parameter.

    Returns:
        The decoded JSON payload, or the integer ``404`` when the server
        answers with HTTP status 404 (callers use this as the end-of-pages
        signal).
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    time.sleep(0.7)  # pause before every request
    # Print the human-readable form (as before); request the escaped form.
    print ("{0}{1}&page={2}".format(URL, tag, page))
    url = "{0}{1}&page={2}".format(URL, quote(str(tag), safe=''), page)
    r = session.get(url)
    if r.status_code == 404:
        return 404
    return repair(r)
def getmsg(mid):
    """Fetch the thread for message id *mid* and return its decoded JSON.

    Sleeps 0.7 seconds before issuing the request. Any exception raised while
    requesting or decoding propagates to the caller unchanged.
    """
    thread_url = "http://api.juick.com/thread?mid={0}".format(mid)
    time.sleep(0.7)
    print ("Try to retrieve mid #{0}".format(mid))
    return repair(session.get(thread_url))
def write(data, tag):
    """Append *data*, serialized as JSON, to the UTF-8 file ``<tag>.json``.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.

    NOTE(review): the file is opened in append mode, so calling this more than
    once for the same tag concatenates JSON documents in one file -- confirm
    this is intended.
    """
    target = '{0}.json'.format(tag)
    with codecs.open(target, 'a', 'utf-8') as out:
        out.write(json.dumps(data, ensure_ascii=False))
def big_json_split(source='posts.json', chunk_size=1000):
    """Split the JSON array stored in *source* into numbered chunk files.

    Chunks are written to the current directory as ``file_0.json``,
    ``file_1.json``, ... Both the source and the chunks are handled as UTF-8
    explicitly: the chunks are written with ``ensure_ascii=False`` and are
    read back as UTF-8 elsewhere in this script, so relying on the platform
    default encoding breaks on non-UTF-8 locales.

    Args:
        source: Path of the JSON file holding one top-level array.
        chunk_size: Maximum number of items per chunk file.

    Returns:
        The names of the chunk files written by this call, in order. Only
        files created here are returned (previously every ``file_*.json`` in
        the directory was returned, including stale or unrelated files that
        callers would then process and delete).

    Raises:
        ValueError: If *chunk_size* is not positive, or *source* is not valid
            JSON.
        OSError: If *source* cannot be read or a chunk cannot be written.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    with open(source, encoding='utf-8') as infile:
        posts = json.load(infile)

    written = []
    for start in range(0, len(posts), chunk_size):
        filename = 'file_{0}.json'.format(start // chunk_size)
        with open(filename, 'w', encoding='utf-8') as outfile:
            json.dump(posts[start:start + chunk_size], outfile, ensure_ascii=False)
        print ("Записан {0}".format(filename))
        written.append(filename)
    return written
def make_tags_list():
    """Return the list of tags to work with.

    When ``tags.json`` exists in the current directory, its JSON content is
    loaded (read as UTF-8), the tag 'созвездие' is appended, and the resulting
    list is printed and returned. Otherwise a list containing only
    'созвездие' is returned.
    """
    if not os.path.isfile('tags.json'):
        return ['созвездие']

    with codecs.open('tags.json', 'r', 'utf-8') as source:
        tags = json.load(source)
    tags.append("созвездие")
    print (tags)
    return tags
# Canonical spelling that every variant tag is normalized to.
_CANONICAL_TAG = 'созвездие'


def _remove_files(paths):
    """Delete every file named in *paths* (the temporary chunk files)."""
    for path in paths:
        os.remove(path)


def _require_posts_file():
    """Exit with a message unless ``posts.json`` exists in the current directory."""
    if not os.path.isfile('posts.json'):
        print("Stored posts file not found.")
        sys.exit()


def _fetch(tag_list):
    """Download all threads for each tag and store them in ``<tag>.json``."""
    for tag in tag_list:
        page = 1
        collect = []
        try:
            while True:
                print ("Try to retrieve page {0}".format(page))
                resp = gettag(tag, page)
                # 404 is the end-of-pages sentinel from gettag(). An empty
                # page is treated the same way so that a server answering
                # 200 with no items cannot make this loop spin forever.
                if resp == 404 or not resp:
                    print ("END")
                    break
                for item in resp:
                    collect.extend(getmsg(item['mid']))
                page += 1
        except KeyboardInterrupt:
            # Keep what has been downloaded so far before quitting.
            write(collect, tag)
            sys.exit()
        write(collect, tag)


def _extract(tag_list):
    """Collect posts carrying any tag from *tag_list* into ``constell_extracted.json``.

    Variant tags are replaced by the canonical tag on each matching post.
    Each matching post is stored and counted exactly once, even when it
    carries several of the listed tags.
    """
    _require_posts_file()
    print ("File exists. Splitting into chunks which contains 1000 items per chunk...")

    counter = 0
    constell_counter = 0
    constell_store = []
    json_files = big_json_split()
    try:
        for chunk_name in json_files:
            print ("Обрабатываю {0}...".format(chunk_name))
            with codecs.open(chunk_name, 'r', 'utf-8') as f:
                chunk = json.load(f)
            for element in chunk:
                counter += 1
                # NOTE(review): each element is indexed at [0]; presumably it
                # is a thread whose first entry is the post itself -- confirm
                # against the posts.json schema.
                if 'tags' not in element[0]:
                    continue
                tags = element[0]['tags']
                matched = [t for t in tag_list if t in tags]
                if not matched:
                    continue
                for t in matched:
                    if t != _CANONICAL_TAG:
                        tags.remove(t)
                if _CANONICAL_TAG not in tags:
                    tags.append(_CANONICAL_TAG)
                # Store once per post. The old per-tag loop ended in a no-op
                # `continue`, so a post matched by a variant tag was appended
                # again when the canonical tag was checked.
                constell_store.append(element)
                constell_counter += 1

        # Write the extracted posts as JSON.
        with codecs.open('constell_extracted.json', 'w', 'utf-8') as f:
            json.dump(constell_store, f, ensure_ascii=False)
    finally:
        # Chunk files are temporary; remove them even if processing failed.
        _remove_files(json_files)

    if constell_counter > 0:
        print (
            'Обработка завершена, обработано всего {0} постов, из них посты о "Созвездии"\n'
            'составляют {1} ({2:.2f}%)'.format(
                counter, constell_counter, constell_counter / counter * 100))
    else:
        print("В этом бложике нет ни слова о 'Созвездии'. КГ/АМ!!!!11")


def _guess():
    """Collect candidate variant tags from ``posts.json`` into ``tags.json``.

    A candidate is any tag matching the pattern below that is not the
    canonical tag itself.
    """
    _require_posts_file()
    pattern = re.compile(r'^с\S*е$')  # raw string: \S is a regex class
    stored_tags = []
    json_files = big_json_split()
    print(json_files)
    try:
        for chunk_name in json_files:
            print("Processing {0}".format(chunk_name))
            with codecs.open(chunk_name, 'r', 'utf-8') as f:
                posts = json.load(f)
            for item in posts:
                if 'tags' not in item[0]:
                    continue
                for tag in item[0]['tags']:
                    if (pattern.match(tag) is not None
                            and tag not in stored_tags
                            and tag != _CANONICAL_TAG):
                        stored_tags.append(tag)
    finally:
        _remove_files(json_files)

    print (chr(27) + "[2J")  # ANSI escape: clear the terminal screen
    if stored_tags:
        print (
            'Список тегов будет записан в файл `tags.json`.\n'
            'Удалите из него те теги, которые НЕ ДОЛЖНЫ попасть в выборку.')
        # Written as UTF-8 explicitly because make_tags_list() reads the file
        # back as UTF-8.
        with open('tags.json', 'w', encoding='utf-8') as outfile:
            json.dump(stored_tags, outfile, ensure_ascii=False)
    else:
        print('Нет тегов вида "с*е".')


def main():
    """Command-line entry point.

    Takes one positional argument, ``action``:

    * ``fetch``   -- download threads for every known tag;
    * ``guess``   -- scan ``posts.json`` for candidate variant tags and write
      them to ``tags.json``;
    * ``extract`` -- pull posts carrying any known tag out of ``posts.json``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('action', choices=['fetch', 'guess', 'extract'])
    args = parser.parse_args()

    # Built for every action, as before (it also prints the list when
    # tags.json exists).
    tag_list = make_tags_list()

    if args.action == 'fetch':
        _fetch(tag_list)
    elif args.action == 'extract':
        _extract(tag_list)
    elif args.action == 'guess':
        _guess()
# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment