ap-Codkelden · August 29, 2015 14:00
diff --git a/starcapture.py b/starcapture.py
 import requests
 import argparse
 import time
 import os
 import sys
 import codecs
 import json
 import glob
 import re

 # Base endpoint for tag queries; gettag() appends the tag name and a
 # `&page=N` parameter to it.
 URL = 'http://api.juick.com/messages?tag='

 # One requests.Session shared by gettag() and getmsg().
 session = requests.Session()

 def repair(data):
   """Parse a response body as JSON after stripping raw line breaks.

   `data` must expose a bytes `content` attribute. Every LF and CR byte is
   removed, then the payload is decoded as UTF-8 and parsed as JSON.
   """
   raw = data.content
   for line_break in (b'\n', b'\r'):
     raw = raw.replace(line_break, b'')
   return json.loads(raw.decode(encoding='UTF-8'))

 def gettag(tag, page):
   """Fetch one page of messages for `tag`.

   Sleeps 0.7 s before the request, prints the requested URL and returns the
   parsed JSON body. A 404 response yields the integer 404 instead.
   """
   time.sleep(0.7)
   url = "{0}{1}&page={2}".format(URL,tag,page)
   print (url)
   response = session.get(url)
   if response.status_code == 404:
     return 404
   return repair(response)

 def getmsg(mid):
   """Fetch the thread for message id `mid` and return its parsed JSON.

   Sleeps 0.7 s before the request and prints a progress line. Exceptions
   raised by the request or by parsing propagate to the caller unchanged
   (the former try/except only re-raised them).
   """
   url = "http://api.juick.com/thread?mid={0}".format(mid)
   time.sleep(0.7)
   print ("Try to retrieve mid #{0}".format(mid))
   return repair(session.get(url))

 def write(data, tag):
   """Append `data` as JSON to `<tag>.json` in the current directory.

   The file is opened in append mode with UTF-8 encoding and non-ASCII
   characters are written unescaped.
   NOTE(review): because of append mode, a second call for the same tag adds
   a second JSON document to the same file - confirm this is intended.
   """
   target = '{0}.json'.format(tag)
   with codecs.open(target, 'a', 'utf-8') as out:
     json.dump(data, out, ensure_ascii=False)

 def big_json_split():
   """Split `posts.json` into chunk files of at most 1000 items each.

   Reads the JSON array stored in `posts.json` (current directory) and writes
   consecutive slices to `file_0.json`, `file_1.json`, ... Files are read and
   written as UTF-8 explicitly: the chunks contain unescaped non-ASCII text
   and main() reopens them as UTF-8, so the platform default encoding must
   not be used.

   Returns the names of the chunk files written by this call, in order.
   Leftover `file_*.json` files from an earlier run are not included.
   Exceptions from reading, parsing or writing propagate to the caller.
   """
   chunk_size = 1000
   with open('posts.json', encoding='utf-8') as infile:
     posts = json.load(infile)
   written = []
   for start in range(0, len(posts), chunk_size):
     filename = 'file_{0}.json'.format(start // chunk_size)
     with open(filename, 'w', encoding='utf-8') as outfile:
       json.dump(posts[start:start + chunk_size], outfile, ensure_ascii=False)
     print ("Записан {0}".format(filename))
     written.append(filename)
   return written

 def make_tags_list():
   """Return the tags to process, always including 'созвездие'.

   When `tags.json` exists in the current directory it is read as a UTF-8
   JSON array; 'созвездие' is appended unless the array already contains it
   (a duplicate would make callers handle that tag twice), and the resulting
   list is printed. Without the file the result is just ['созвездие'].
   Exceptions from reading or parsing the file propagate to the caller.
   """
   if not os.path.isfile('tags.json'):
     return ['созвездие']
   with codecs.open('tags.json', 'r', 'utf-8') as f:
     tags = json.load(f)
   if "созвездие" not in tags:
     tags.append("созвездие")
   print (tags)
   return tags

 def _remove_chunks(json_files):
   """Delete the temporary chunk files listed in `json_files`."""
   for chunk_name in json_files:
     os.remove(chunk_name)

 def _fetch(taglist):
   """Download all threads for every tag in `taglist` into `<tag>.json`."""
   for tag in taglist:
     page = 1
     collect = []
     while True:
       print ("Try to retrieve page {0}".format(page))
       try:
         resp = gettag(tag, page)
         if resp == 404:
           print ("END")
           break
         for item in resp:
           collect.extend(getmsg(item['mid']))
         page += 1
       except KeyboardInterrupt:
         # Save what has been downloaded so far before quitting.
         write(collect, tag)
         sys.exit()
     # Save inside the tag loop: `collect` is reset for the next tag, so
     # writing after the loop would keep only the last tag's threads.
     write(collect, tag)

 def _extract(taglist):
   """Copy posts carrying a tag from `taglist` into constell_extracted.json."""
   if not os.path.isfile('posts.json'):
     # Without the input there are no chunk files to process or remove.
     print("Stored posts file not found.")
     sys.exit()
   counter = 0
   constell_counter = 0
   constell_store = []
   print ("File exists. Splitting into chunks which contains 1000 items per chunk...")
   json_files = big_json_split()
   for j in json_files:
     print ("Обрабатываю {0}...".format(j))
     with codecs.open(j, 'r', 'utf-8') as f:
       ff = json.loads((f.read()).replace('\n', '').replace('\r', ''))
     for element in ff:
       counter += 1
       if 'tags' not in element[0]:
         continue
       tags = element[0]['tags']
       for t in taglist:
         if t in tags:
           if t != 'созвездие':
             # Replace the alternative tag with the canonical one.
             tags.remove(t)
             if 'созвездие' not in tags:
               tags.append('созвездие')
           constell_store.append(element)
           constell_counter += 1
           # Stop at the first matching tag so a post carrying both
           # '*созвездие' and '*созведзие' is not stored twice.
           # Unlikely, but who knows.
           break
   # Write the JSON result.
   with codecs.open('constell_extracted.json', 'w', 'utf-8') as f:
     json.dump(constell_store, f, ensure_ascii=False)
   _remove_chunks(json_files)
   if constell_counter > 0:
     print (
 """Обработка завершена, обработано всего {0} постов, из них посты о "Созвездии"
 составляют {1} ({2:.2f}%)""".format(counter, constell_counter, constell_counter/counter*100))
   else:
     print("В этом бложике нет ни слова о 'Созвездии'. КГ/АМ!!!!11")

 def _guess():
   """Collect tags that start with 'с' and end with 'е' into tags.json."""
   if not os.path.isfile('posts.json'):
     print("Stored posts file not found.")
     sys.exit()
   stored_tags = []
   m = re.compile(r'^с\S*е$')
   json_files = big_json_split()
   print(json_files)
   for chunk_name in json_files:
     print("Processing {0}".format(chunk_name))
     with codecs.open(chunk_name, 'r', 'utf-8') as f:
       d = json.load(f)
     # Scan every chunk, not just the last one loaded.
     for item in d:
       if 'tags' not in item[0]:
         continue
       for tag in item[0]['tags']:
         if (m.match(tag) is not None and tag not in stored_tags
             and tag != 'созвездие'):
           stored_tags.append(tag)
   _remove_chunks(json_files)
   # ANSI escape sequence that clears the terminal screen.
   print (chr(27) + "[2J")
   if stored_tags:
     print (
 """Список тегов будет записан в файл `tags.json`. 
 Удалите из него те теги, которые НЕ ДОЛЖНЫ попасть в выборку.""")
     # UTF-8 explicitly: make_tags_list() reads this file back as UTF-8.
     with open('tags.json', 'w', encoding='utf-8') as outfile:
       json.dump(stored_tags, outfile, ensure_ascii=False)
   else:
     print('Нет тегов вида "с*е".')

 def main():
   """Parse the command line and run the requested action.

   fetch   - download all threads for each known tag into `<tag>.json`
   guess   - scan posts.json for tags shaped like 'с...е' and save them to
             tags.json for manual review
   extract - copy posts carrying a known tag into constell_extracted.json
   """
   parser = argparse.ArgumentParser()
   parser.add_argument('action', choices=['fetch', 'guess', 'extract'])
   args = parser.parse_args()

   TAGLIST = make_tags_list()

   if args.action == 'fetch':
     _fetch(TAGLIST)
   elif args.action == 'extract':
     _extract(TAGLIST)
   elif args.action == 'guess':
     _guess()

 # Run the command-line interface only when executed as a script.
 if __name__ == "__main__":
  main()
	import requests
	import argparse
	import time
	import os
	import sys
	import codecs
	import json
	import glob
	import re

	# Base endpoint for tag queries; gettag() appends the tag name and a
	# `&page=N` parameter to it.
	URL = 'http://api.juick.com/messages?tag='

	# One requests.Session shared by gettag() and getmsg().
	session = requests.Session()

	def repair(data):
	  """Parse a response body as JSON after stripping raw line breaks.

	  `data` must expose a bytes `content` attribute. Every LF and CR byte is
	  removed, then the payload is decoded as UTF-8 and parsed as JSON.
	  """
	  raw = data.content.replace(b'\n', b'').replace(b'\r', b'')
	  return json.loads(raw.decode(encoding='UTF-8'))

	def gettag(tag, page):
	  """Fetch one page of messages for `tag`.

	  Sleeps 0.7 s before the request, prints the requested URL and returns the
	  parsed JSON body. A 404 response yields the integer 404 instead.
	  """
	  time.sleep(0.7)
	  url = "{0}{1}&page={2}".format(URL,tag,page)
	  print (url)
	  r = session.get(url)
	  if r.status_code != 404:
	    return repair(r)
	  else:
	    return 404

	def getmsg(mid):
	  """Fetch the thread for message id `mid` and return its parsed JSON.

	  Sleeps 0.7 s before the request and prints a progress line. Exceptions
	  raised by the request or by parsing propagate to the caller unchanged
	  (the former try/except only re-raised them).
	  """
	  url = "http://api.juick.com/thread?mid={0}".format(mid)
	  time.sleep(0.7)
	  print ("Try to retrieve mid #{0}".format(mid))
	  return repair(session.get(url))

	def write(data, tag):
	  """Append `data` as JSON to `<tag>.json` in the current directory.

	  The file is opened in append mode with UTF-8 encoding and non-ASCII
	  characters are written unescaped.
	  NOTE(review): because of append mode, a second call for the same tag adds
	  a second JSON document to the same file - confirm this is intended.
	  """
	  with codecs.open('{0}.json'.format(tag), 'a', 'utf-8') as f:
	    json.dump(data, f, ensure_ascii=False)

	def big_json_split():
	  """Split `posts.json` into chunk files of at most 1000 items each.

	  Reads the JSON array stored in `posts.json` (current directory) and writes
	  consecutive slices to `file_0.json`, `file_1.json`, ... Files are read and
	  written as UTF-8 explicitly: the chunks contain unescaped non-ASCII text
	  and main() reopens them as UTF-8, so the platform default encoding must
	  not be used.

	  Returns the names of the chunk files written by this call, in order.
	  Leftover `file_*.json` files from an earlier run are not included.
	  Exceptions from reading, parsing or writing propagate to the caller.
	  """
	  chunk_size = 1000
	  with open('posts.json', encoding='utf-8') as infile:
	    posts = json.load(infile)
	  written = []
	  for start in range(0, len(posts), chunk_size):
	    filename = 'file_{0}.json'.format(start // chunk_size)
	    with open(filename, 'w', encoding='utf-8') as outfile:
	      json.dump(posts[start:start + chunk_size], outfile, ensure_ascii=False)
	    print ("Записан {0}".format(filename))
	    written.append(filename)
	  return written

	def make_tags_list():
	  """Return the tags to process, always including 'созвездие'.

	  When `tags.json` exists in the current directory it is read as a UTF-8
	  JSON array; 'созвездие' is appended unless the array already contains it
	  (a duplicate would make callers handle that tag twice), and the resulting
	  list is printed. Without the file the result is just ['созвездие'].
	  Exceptions from reading or parsing the file propagate to the caller.
	  """
	  if not os.path.isfile('tags.json'):
	    return ['созвездие']
	  with codecs.open('tags.json', 'r', 'utf-8') as f:
	    tags = json.load(f)
	  if "созвездие" not in tags:
	    tags.append("созвездие")
	  print (tags)
	  return tags

	def _remove_chunks(json_files):
	  """Delete the temporary chunk files listed in `json_files`."""
	  for chunk_name in json_files:
	    os.remove(chunk_name)

	def _fetch(taglist):
	  """Download all threads for every tag in `taglist` into `<tag>.json`."""
	  for tag in taglist:
	    page = 1
	    collect = []
	    while True:
	      print ("Try to retrieve page {0}".format(page))
	      try:
	        resp = gettag(tag, page)
	        if resp == 404:
	          print ("END")
	          break
	        for item in resp:
	          collect.extend(getmsg(item['mid']))
	        page += 1
	      except KeyboardInterrupt:
	        # Save what has been downloaded so far before quitting.
	        write(collect, tag)
	        sys.exit()
	    # Save inside the tag loop: `collect` is reset for the next tag, so
	    # writing after the loop would keep only the last tag's threads.
	    write(collect, tag)

	def _extract(taglist):
	  """Copy posts carrying a tag from `taglist` into constell_extracted.json."""
	  if not os.path.isfile('posts.json'):
	    # Without the input there are no chunk files to process or remove.
	    print("Stored posts file not found.")
	    sys.exit()
	  counter = 0
	  constell_counter = 0
	  constell_store = []
	  print ("File exists. Splitting into chunks which contains 1000 items per chunk...")
	  json_files = big_json_split()
	  for j in json_files:
	    print ("Обрабатываю {0}...".format(j))
	    with codecs.open(j, 'r', 'utf-8') as f:
	      ff = json.loads((f.read()).replace('\n', '').replace('\r', ''))
	    for element in ff:
	      counter += 1
	      if 'tags' not in element[0]:
	        continue
	      tags = element[0]['tags']
	      for t in taglist:
	        if t in tags:
	          if t != 'созвездие':
	            # Replace the alternative tag with the canonical one.
	            tags.remove(t)
	            if 'созвездие' not in tags:
	              tags.append('созвездие')
	          constell_store.append(element)
	          constell_counter += 1
	          # Stop at the first matching tag so a post carrying both
	          # 'созвездие' and 'созведзие' is not stored twice.
	          # Unlikely, but who knows.
	          break
	  # Write the JSON result.
	  with codecs.open('constell_extracted.json', 'w', 'utf-8') as f:
	    json.dump(constell_store, f, ensure_ascii=False)
	  _remove_chunks(json_files)
	  if constell_counter > 0:
	    print (
	"""Обработка завершена, обработано всего {0} постов, из них посты о "Созвездии"
	составляют {1} ({2:.2f}%)""".format(counter, constell_counter, constell_counter/counter*100))
	  else:
	    print("В этом бложике нет ни слова о 'Созвездии'. КГ/АМ!!!!11")

	def _guess():
	  """Collect tags that start with 'с' and end with 'е' into tags.json."""
	  if not os.path.isfile('posts.json'):
	    print("Stored posts file not found.")
	    sys.exit()
	  stored_tags = []
	  m = re.compile(r'^с\S*е$')
	  json_files = big_json_split()
	  print(json_files)
	  for chunk_name in json_files:
	    print("Processing {0}".format(chunk_name))
	    with codecs.open(chunk_name, 'r', 'utf-8') as f:
	      d = json.load(f)
	    # Scan every chunk, not just the last one loaded.
	    for item in d:
	      if 'tags' not in item[0]:
	        continue
	      for tag in item[0]['tags']:
	        if (m.match(tag) is not None and tag not in stored_tags
	            and tag != 'созвездие'):
	          stored_tags.append(tag)
	  _remove_chunks(json_files)
	  # ANSI escape sequence that clears the terminal screen.
	  print (chr(27) + "[2J")
	  if stored_tags:
	    print (
	"""Список тегов будет записан в файл `tags.json`.
	Удалите из него те теги, которые НЕ ДОЛЖНЫ попасть в выборку.""")
	    # UTF-8 explicitly: make_tags_list() reads this file back as UTF-8.
	    with open('tags.json', 'w', encoding='utf-8') as outfile:
	      json.dump(stored_tags, outfile, ensure_ascii=False)
	  else:
	    print('Нет тегов вида "с*е".')

	def main():
	  """Parse the command line and run the requested action.

	  fetch   - download all threads for each known tag into `<tag>.json`
	  guess   - scan posts.json for tags shaped like 'с...е' and save them to
	            tags.json for manual review
	  extract - copy posts carrying a known tag into constell_extracted.json
	  """
	  parser = argparse.ArgumentParser()
	  parser.add_argument('action', choices=['fetch', 'guess', 'extract'])
	  args = parser.parse_args()

	  TAGLIST = make_tags_list()

	  if args.action == 'fetch':
	    _fetch(TAGLIST)
	  elif args.action == 'extract':
	    _extract(TAGLIST)
	  elif args.action == 'guess':
	    _guess()

	# Run the command-line interface only when executed as a script.
	if __name__ == "__main__":
	  main()