Skip to content

Instantly share code, notes, and snippets.

@ap-Codkelden
Last active August 29, 2015 14:00
Show Gist options
  • Save ap-Codkelden/11333479 to your computer and use it in GitHub Desktop.
Save ap-Codkelden/11333479 to your computer and use it in GitHub Desktop.
import requests
import argparse
import time
import os
import sys
import codecs
import json
import glob
import re
# Endpoint prefix for listing messages by tag; the tag name is appended to it.
URL = 'http://api.juick.com/messages?tag='
# Single HTTP session shared by every request this script makes.
session = requests.Session()
def repair(data):
    """Parse the JSON body of an HTTP response that may contain raw control characters.

    Args:
        data: a response-like object exposing the raw body as ``data.content``
            (bytes, expected to be UTF-8 encoded).

    Returns:
        The decoded JSON value.

    Raises:
        UnicodeDecodeError: if the body is not valid UTF-8.
        ValueError: if the cleaned body is still not valid JSON.
    """
    # Carriage returns and line feeds are dropped entirely, exactly as before,
    # so string values come back without embedded line breaks.
    cleaned = data.content.translate(None, b'\r\n')
    # strict=False lets any remaining raw control character (for example a
    # tab) inside a string value through instead of aborting the whole parse.
    return json.loads(cleaned.decode('UTF-8'), strict=False)
def gettag(tag, page):
    """Fetch one page of messages carrying *tag*.

    Args:
        tag: tag name to query.
        page: page number passed to the API as ``page``.

    Returns:
        The parsed JSON body, or the integer ``404`` when the server answers
        with HTTP 404 (callers use that value as the end-of-results marker).
    """
    from urllib.parse import quote

    # Pause before every request to avoid hammering the server.
    time.sleep(0.7)
    # Log the human-readable URL, as the script always has.
    print ("{0}{1}&page={2}".format(URL, tag, page))
    # Percent-encode the tag so characters such as '&', '#', '+' or '='
    # cannot break or truncate the query string.
    url = "{0}{1}&page={2}".format(URL, quote(str(tag), safe=""), page)
    r = session.get(url)
    if r.status_code == 404:
        return 404
    return repair(r)
def getmsg(mid):
    """Fetch the thread for message id *mid* and return its parsed JSON.

    Any exception raised while requesting or parsing propagates to the caller.
    """
    thread_url = "http://api.juick.com/thread?mid={0}".format(mid)
    # Pause before every request to avoid hammering the server.
    time.sleep(0.7)
    print ("Try to retrieve mid #{0}".format(mid))
    return repair(session.get(thread_url))
def write(data, tag):
    """Append *data*, serialized as JSON, to the UTF-8 file ``<tag>.json``.

    The file is opened in append mode, so repeated calls for the same tag
    place the serialized documents back to back.
    """
    target = '{0}.json'.format(tag)
    encoder = json.JSONEncoder(ensure_ascii=False)
    with codecs.open(target, 'a', 'utf-8') as sink:
        # Stream the encoder's chunks straight into the file.
        for piece in encoder.iterencode(data):
            sink.write(piece)
def big_json_split(source='posts.json', chunk_size=1000):
    """Split the JSON array stored in *source* into numbered chunk files.

    Chunks are written to ``file_0.json``, ``file_1.json``, ... in the current
    working directory, each holding at most *chunk_size* items.

    Args:
        source: path of the JSON file to split; it must contain an array.
        chunk_size: maximum number of items per chunk file.

    Returns:
        The names of the chunk files written by this call, in order. Files
        matching ``file_*.json`` left over from elsewhere are not included.

    Raises:
        ValueError: if *chunk_size* is smaller than 1.
        OSError: if *source* cannot be read or a chunk cannot be written.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    # Read and write as UTF-8 explicitly: the chunks are dumped with
    # ensure_ascii=False and are read back as UTF-8 by the callers.
    with open(source, encoding='utf-8') as infile:
        posts = json.load(infile)
    written = []
    for index, start in enumerate(range(0, len(posts), chunk_size)):
        filename = 'file_{0}.json'.format(index)
        with open(filename, 'w', encoding='utf-8') as outfile:
            json.dump(posts[start:start + chunk_size], outfile, ensure_ascii=False)
        print ("Записан {0}".format(filename))
        written.append(filename)
    return written
def make_tags_list():
    """Return the list of tags to process, always including 'созвездие'.

    When ``tags.json`` exists in the current directory, its JSON array is
    loaded, the canonical tag is added if it is not already there, and the
    resulting list is printed. Otherwise the list holds only the canonical tag.

    Returns:
        A list of tag strings.
    """
    canonical = 'созвездие'
    if not os.path.isfile('tags.json'):
        return [canonical]
    # 'utf-8-sig' reads plain UTF-8 and also tolerates a byte-order mark,
    # which some editors add when the file is edited by hand.
    with open('tags.json', encoding='utf-8-sig') as tags_file:
        tags = json.load(tags_file)
    # Avoid listing the canonical tag twice; a duplicate would make callers
    # that loop over this list handle the same tag more than once.
    if canonical not in tags:
        tags.append(canonical)
    print (tags)
    return tags
# Tag that every matching post is normalized to.
_CANONICAL_TAG = 'созвездие'


def _load_chunk(path):
    """Load one UTF-8 encoded JSON chunk file and return its content."""
    with codecs.open(path, 'r', 'utf-8') as chunk:
        return json.load(chunk)


def _remove_files(paths):
    """Delete every file in *paths*; a failed removal propagates."""
    for path in paths:
        os.remove(path)


def _fetch_tag(tag):
    """Download every thread found under *tag* and append the result to ``<tag>.json``."""
    collected = []
    page = 1
    try:
        while True:
            print ("Try to retrieve page {0}".format(page))
            resp = gettag(tag, page)
            # 404 marks the end of the results; an empty page is treated the
            # same way so the loop cannot spin forever on it.
            if resp == 404 or not resp:
                print ("END")
                break
            for item in resp:
                collected.extend(getmsg(item['mid']))
            page += 1
    except KeyboardInterrupt:
        # Keep whatever was downloaded so far before leaving.
        write(collected, tag)
        sys.exit()
    write(collected, tag)


def _select_tagged_threads(threads, tags):
    """Return the entries of *threads* that carry any tag from *tags*.

    Each entry is assumed to be a sequence whose first item may hold a
    'tags' list (TODO confirm against the producer of posts.json). Matching
    tags other than the canonical one are removed from that list and the
    canonical tag is ensured to be present once. Every entry is returned at
    most once, even when several of its tags match.
    """
    selected = []
    for thread in threads:
        head = thread[0]
        if 'tags' not in head:
            continue
        post_tags = head['tags']
        matched = [t for t in tags if t in post_tags]
        if not matched:
            continue
        for alias in matched:
            if alias != _CANONICAL_TAG:
                post_tags.remove(alias)
        if _CANONICAL_TAG not in post_tags:
            post_tags.append(_CANONICAL_TAG)
        selected.append(thread)
    return selected


def _extract(tags):
    """Copy every post tagged with one of *tags* from posts.json to constell_extracted.json."""
    if not os.path.isfile('posts.json'):
        # Same report as the 'guess' action gives for a missing input file.
        print("Stored posts file not found.")
        sys.exit()
    print ("File exists. Splitting into chunks which contains 1000 items per chunk...")
    total = 0
    selected = []
    chunk_files = big_json_split()
    try:
        for path in chunk_files:
            print ("Обрабатываю {0}...".format(path))
            threads = _load_chunk(path)
            total += len(threads)
            selected.extend(_select_tagged_threads(threads, tags))
        with codecs.open('constell_extracted.json', 'w', 'utf-8') as out:
            json.dump(selected, out, ensure_ascii=False)
    finally:
        # The chunk files are temporary; remove them even if processing failed.
        _remove_files(chunk_files)
    if selected:
        print (
"""Обработка завершена, обработано всего {0} постов, из них посты о "Созвездии"
составляют {1} ({2:.2f}%)""".format(total, len(selected), len(selected)/total*100))
    else:
        print("В этом бложике нет ни слова о 'Созвездии'. КГ/АМ!!!!11")


def _guess():
    """Collect tags of the form "с…е" from posts.json and store them in tags.json."""
    if not os.path.isfile('posts.json'):
        print("Stored posts file not found.")
        sys.exit()
    pattern = re.compile(r'^с\S*е$')
    candidates = []
    seen = set()
    chunk_files = big_json_split()
    print(chunk_files)
    try:
        for path in chunk_files:
            print("Processing {0}".format(path))
            for thread in _load_chunk(path):
                head = thread[0]
                if 'tags' not in head:
                    continue
                for tag in head['tags']:
                    if tag == _CANONICAL_TAG or tag in seen:
                        continue
                    if pattern.match(tag) is not None:
                        seen.add(tag)
                        candidates.append(tag)
    finally:
        # The chunk files are temporary; remove them even if processing failed.
        _remove_files(chunk_files)
    # ANSI escape sequence that clears the terminal screen.
    print (chr(27) + "[2J")
    if candidates:
        print (
"""Список тегов будет записан в файл `tags.json`.
Удалите из него те теги, которые НЕ ДОЛЖНЫ попасть в выборку.""")
        # Written as UTF-8 explicitly because the tag list is read back as UTF-8.
        with open('tags.json', 'w', encoding='utf-8') as outfile:
            json.dump(candidates, outfile, ensure_ascii=False)
    else:
        print('Нет тегов вида "с*е".')


def main():
    """Command-line entry point.

    The single positional argument ``action`` selects the mode:

    * ``fetch``   - download all threads for every known tag, one file per tag;
    * ``extract`` - copy posts carrying a known tag from posts.json into
      constell_extracted.json and print statistics;
    * ``guess``   - scan posts.json for tags shaped like "с…е" and save the
      candidates to tags.json for manual review.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('action', choices=['fetch', 'guess', 'extract'])
    args = parser.parse_args()
    tags = make_tags_list()
    if args.action == 'fetch':
        for tag in tags:
            _fetch_tag(tag)
    elif args.action == 'extract':
        _extract(tags)
    elif args.action == 'guess':
        _guess()
# Run the command-line entry point only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment