Last active
September 5, 2018 03:29
-
-
Save freelze/a1867e8c6caffb25ef11862456f60aa5 to your computer and use it in GitHub Desktop.
Plurk media crawler (without response media), using multiprocessing to speed up the crawling.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding:utf-8 -*- | |
""" | |
Compare MultiProcessing and Normal Way | |
Case:crawl 84 images in 81 plurk posts | |
(sec) | |
MultiProcessing : | |
117.3681275844574 | |
118.62501263618469 | |
122.00638699531555 | |
========================= | |
Normal Way: | |
376.74958324432373 | |
384.2063329219818 | |
""" | |
# ref: https://github.com/clsung/plurk-oauth | |
CONSUMER_KEY = '' | |
CONSUMER_SECRET = '' | |
ACCESS_TOKEN = '' | |
ACCESS_TOKEN_SECRET = '' | |
#import aiohttp | |
#import asyncio | |
#import multiprocessing as mp | |
from multiprocessing import Pool | |
#from pathos.multiprocessing import ProcessingPool as Pool | |
#from multiprocessing import Manager | |
#from pathos.pools import ProcessPool as Pool | |
#from multiprocessing import Process | |
#import threading | |
import re | |
import json | |
import os | |
#import urllib2 | |
import urllib | |
from plurk_oauth import PlurkAPI | |
import requests | |
import calendar | |
import time | |
from time import gmtime, strftime | |
from pprint import pprint | |
# https://stackoverflow.com/questions/1181919/python-base-36-encoding | |
import base36 | |
#Replace the keys and secrets in oauth_key.json with your app's. | |
#You can retrieve your app keys via the test tool at http://www.plurk.com/PlurkApp/ | |
#timeOffset = '2014-7-09T16:00:18' | |
#raw_json = plurk.callAPI('/APP/Timeline/getPlurks',{'limit':1000})['plurks'] #30 | |
#raw_json = plurk.callAPI('/APP/Profile/getOwnProfile')['plurks'] | |
#raw_json = plurk.callAPI('/APP/Timeline/getUnreadPlurks')['plurks'] #197 | |
#raw_json = plurk.callAPI('/APP/Polling/getPlurks',{'offset':'2013-1-20T21:55:34','limit':1000,'favorers_detail':False,'limited_detail':False,'replurkers_detail':False})['plurks'] #50 | |
#raw_json = plurk.callAPI('/APP/Realtime/getUserChannel')['plurks'] | |
#raw_json = plurk.callAPI('/APP/Timeline/getPublicPlurks',{'user_id':'dark42042n', 'offset':'2010-1-03T12:49:35','limit':1000,'favorers_detail':False,'limited_detail':False,'replurkers_detail':False})['plurks'] # limit = 30 | |
# ref: https://stackoverflow.com/questions/7160737/python-how-to-validate-a-url-in-python-malformed-or-not | |
url_validation_regex = re.compile( | |
r'^(?:http|ftp)s?://' # http:// or https:// | |
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain... | |
r'localhost|' #localhost... | |
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip | |
r'(?::\d+)?' # optional port | |
r'(?:/?|[/?]\S+)$', re.IGNORECASE) | |
# ref: https://stackoverflow.com/questions/16511337/correct-way-to-try-except-using-python-requests-module | |
def urlExists(path): | |
try: | |
r = requests.get(path,timeout=10) | |
#print('r =',r) | |
except requests.exceptions.RequestException as err: | |
print("OOps: Something Else",err) | |
return False | |
except requests.exceptions.HTTPError as errh: | |
print ("Http Error:",errh) | |
return False | |
except requests.exceptions.ConnectionError as errc: | |
print ("Error Connecting:",errc) | |
return False | |
except requests.exceptions.Timeout as errt: | |
print ("Timeout Error:",errt) | |
return False | |
else: | |
return r.status_code == requests.codes.ok | |
def getPublicPlurks( _plurk, _id, time_Offset ): | |
rawJson = _plurk.callAPI('/APP/Timeline/getPublicPlurks',{'user_id':_id, 'offset':time_Offset, 'limit':30, 'favorers_detail':False, 'limited_detail':False, 'replurkers_detail':False})['plurks'] | |
return rawJson | |
def plurkApiLogin(): | |
_plurk = PlurkAPI(CONSUMER_KEY, CONSUMER_SECRET) | |
_plurk.authorize(ACCESS_TOKEN, ACCESS_TOKEN_SECRET) | |
checkToken = _plurk.callAPI('/APP/checkToken') | |
if (checkToken is None): | |
print("Your CONSUMER_KEY or CONSUMER_SECRET is wrong!") | |
time.sleep(1) | |
raise SystemExit | |
exit() | |
return _plurk | |
""" | |
class myThread(threading.Thread): | |
def __init__(self, queue): | |
self.user_queue = queue | |
self.total_user = [] | |
self.total_url = [] | |
#self.f_user = open('user.txt', 'a+') | |
#self.f_source = open('source.txt', 'a+') | |
threading.Thread.__init__(self) | |
def download(self, url): | |
res = requests.get(url) | |
source_list = [] | |
soup = BeautifulSoup(res.text) | |
iframes = soup.find_all('iframe') | |
tmp_source = [] | |
for i in iframes: | |
source = i.get('src', '').strip() | |
if source and source.find('https://www.tumblr.com/video') != -1 and source not in self.total_url: | |
source_list.append(source) | |
tmp_source.append(source) | |
print (u'新增链接:' + source) | |
tmp_user = [] | |
new_users = soup.find_all(class_='reblog-link') | |
for user in new_users: | |
username = user.text.strip() | |
if username and username not in self.total_user: | |
self.user_queue.put(username) | |
self.total_user.append(username) | |
tmp_user.append(username) | |
print (u'新增用户:' + username) | |
#mutex.acquire() | |
#if tmp_user: | |
#self.f_user.write('\n'.join(tmp_user) + '\n') | |
#if tmp_source: | |
#self.f_source.write('\n'.join(tmp_source) + '\n') | |
#mutex.release() | |
def run(self): | |
global is_exit | |
while not is_exit: | |
user = self.user_queue.get() | |
url = 'http://%s.tumblr.com/' % user | |
self.download(url) | |
time.sleep(2) | |
#self.f_user.close() | |
#self.f_source.close() | |
""" | |
basePath = os.getcwd() + '\\' | |
baseUrl = "https://www.plurk.com/p/" | |
""" | |
NUM_WORKERS = 30 | |
threads = [] | |
for i in range(NUM_WORKERS): | |
tumblr = Tumblr(q) | |
tumblr.setDaemon(True) | |
tumblr.start() | |
threads.append(tumblr) | |
while True: | |
for i in threads: | |
if not i.isAlive(): | |
break | |
time.sleep(1)""" | |
for i in raw_json: | |
multiInfoDict['postCount'] += 1 | |
multiInfoDict['thisPostMediaCount'] = 0 | |
if (i['owner_id'] != id): | |
print("posted:", i['posted']) | |
print("@@@@@@@@@@replurk@@@@@@@@@@") | |
continue | |
if (i['favorite_count'] > multiInfoDict['lowStandardFav']): | |
print("===================================================================================") | |
multiInfoDict['higherFavPostCount'] += 1 | |
owner_id_int = i['owner_id'] | |
owner_id = str(i['owner_id']) | |
print("owner_id:", i['owner_id']) | |
base36_plurk_id = str(base36.dumps(i['plurk_id'])) | |
print("postUrl:", baseUrl + base36_plurk_id) | |
print("posted:", i['posted']) | |
splitStr = i['posted'].split() | |
abbr_to_num = {name: num for num, name in enumerate(calendar.month_abbr) if num} | |
fileNameTime = splitStr[3] + '_' + str(abbr_to_num[splitStr[2]]) + '_' + splitStr[1] | |
print("******************") | |
print("porn:", i['porn']) | |
print("favorite_count:", i['favorite_count']) | |
print("response_count:", i['response_count']) | |
pprint(i['content_raw']) # type:str | |
_list = i['content'].split() | |
for content in _list: | |
if (content[0:4] == 'href'): | |
content = content[:-1] | |
if (content[-3:] == 'jpg'): | |
if (re.match(url_validation_regex, str(content[6:])) is None): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
if (urlExists(str(content[6:])) == False): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
print(content[6:]) | |
multiInfoDict['downloadedMedia'] += 1 | |
multiInfoDict['thisPostMediaCount'] += 1 | |
image_name = fileNameTime + '-' + base36_plurk_id + '-' + str( | |
multiInfoDict['thisPostMediaCount']) + '-' + owner_id + ".jpg" | |
path = basePath + image_name | |
if(os.path.isfile(path)): | |
print(image_name, "was already downloaded.") | |
multiInfoDict['downloadedMedia'] -= 1 | |
continue | |
with open(image_name, 'wb') as handler: | |
handler.write(requests.get(str(content[6:])).content) | |
elif (content[-3:] == 'png'): | |
if (re.match(url_validation_regex, str(content[6:])) is None): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
if (urlExists(str(content[6:])) == False): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
print(content[6:]) | |
multiInfoDict['downloadedMedia'] += 1 | |
multiInfoDict['thisPostMediaCount'] += 1 | |
image_name = fileNameTime + '-' + base36_plurk_id + '-' + str( | |
multiInfoDict['thisPostMediaCount']) + '-' + owner_id + ".png" | |
path = basePath + image_name | |
if (os.path.isfile(path)): | |
print(image_name, "was already downloaded.") | |
multiInfoDict['downloadedMedia'] -= 1 | |
continue | |
with open(image_name, 'wb') as handler: | |
handler.write(requests.get(str(content[6:])).content) | |
elif (content[-3:] == 'gif'): | |
if (re.match(url_validation_regex, str(content[6:])) is None): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
if (urlExists(str(content[6:])) == False): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
print(content[6:]) | |
multiInfoDict['downloadedMedia'] += 1 | |
multiInfoDict['thisPostMediaCount'] += 1 | |
image_name = fileNameTime + '-' + base36_plurk_id + '-' + str( | |
multiInfoDict['thisPostMediaCount']) + '-' + owner_id + ".gif" | |
path = basePath + image_name | |
if (os.path.isfile(path)): | |
print(image_name, "was already downloaded.") | |
multiInfoDict['downloadedMedia'] -= 1 | |
continue | |
with open(image_name, 'wb') as handler: | |
handler.write(requests.get(str(content[6:])).content) | |
elif (content[-3:] == 'mp4'): | |
if (re.match(url_validation_regex, str(content[6:])) is None): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
if (urlExists(str(content[6:])) == False): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
print(content[6:]) | |
multiInfoDict['downloadedMedia'] += 1 | |
multiInfoDict['thisPostMediaCount'] += 1 | |
image_name = fileNameTime + '-' + base36_plurk_id + '-' + str( | |
multiInfoDict['thisPostMediaCount']) + '-' + owner_id + ".mp4" | |
path = basePath + image_name | |
if (os.path.isfile(path)): | |
print(image_name, "was already downloaded.") | |
multiInfoDict['downloadedMedia'] -= 1 | |
continue | |
multiInfoDict['downloadedMedia'] += 1 | |
multiInfoDict['thisPostMediaCount'] += 1 | |
with open(image_name, 'wb') as handler: | |
handler.write(requests.get(str(content[6:])).content) | |
else: | |
print("others link:", content[6:]) | |
multiInfoDict = getResponses(_plurk, i['plurk_id'], owner_id_int, owner_id, fileNameTime, base36_plurk_id, multiInfoDict) | |
def getResponses(plurk, pID, owner_id_int, owner_id, fileNameTime, base36_plurk_id, thisPostMediaCount): | |
res_raw_json = plurk.callAPI('/APP/Responses/get', {'plurk_id':pID} ) | |
basePath = os.getcwd() + '\\' | |
# for loop each responses | |
response_count = 0 | |
response_media = 0 | |
for j in res_raw_json['responses']: | |
if (j['user_id'] == owner_id_int): | |
print("author content") | |
res_content_raw = j['content_raw'].split() | |
for responseLink in res_content_raw: | |
if (responseLink[-4:] == '.jpg'): | |
if (re.match(url_validation_regex, responseLink) is None): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
if (urlExists(responseLink) == False): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
response_media += 1 | |
response_count += 1 | |
#multiInfoDict['downloadedMedia'] += 1 | |
thisPostMediaCount += 1 | |
image_name = fileNameTime + '-' + base36_plurk_id + '-' + str( | |
thisPostMediaCount) + '-' + "response" + '-' + str( | |
response_count) + '-' + owner_id + ".jpg" | |
path = basePath + image_name | |
if (os.path.isfile(path)): | |
print(image_name, "was already downloaded.") | |
#multiInfoDict['downloadedMedia'] -= 1 | |
continue | |
with open(image_name, 'wb') as handler: | |
handler.write(requests.get(str(responseLink)).content) | |
elif (responseLink[-4:] == '.png'): | |
if (re.match(url_validation_regex, responseLink) is None): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
if (urlExists(responseLink) == False): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
response_media += 1 | |
response_count += 1 | |
#multiInfoDict['downloadedMedia'] += 1 | |
thisPostMediaCount += 1 | |
image_name = fileNameTime + '-' + base36_plurk_id + '-' + str( | |
thisPostMediaCount) + '-' + "response" + '-' + str( | |
response_count) + '-' + owner_id + ".png" | |
path = basePath + image_name | |
if (os.path.isfile(path)): | |
print(image_name, "was already downloaded.") | |
#multiInfoDict['downloadedMedia'] -= 1 | |
continue | |
with open(image_name, 'wb') as handler: | |
handler.write(requests.get(str(responseLink)).content) | |
elif (responseLink[-4:] == '.gif'): | |
if (re.match(url_validation_regex, responseLink) is None): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
if (urlExists(responseLink) == False): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
response_media += 1 | |
response_count += 1 | |
#multiInfoDict['downloadedMedia'] += 1 | |
thisPostMediaCount += 1 | |
image_name = fileNameTime + '-' + base36_plurk_id + '-' + str( | |
thisPostMediaCount) + '-' + "response" + '-' + str( | |
response_count) + '-' + owner_id + ".gif" | |
path = basePath + image_name | |
if (os.path.isfile(path)): | |
print(image_name, "was already downloaded.") | |
#multiInfoDict['downloadedMedia'] -= 1 | |
continue | |
with open(image_name, 'wb') as handler: | |
handler.write(requests.get(str(responseLink)).content) | |
return multiInfoDict | |
def parsePostsJob(i): | |
#global _plurk | |
#globalVar() | |
#global id | |
plurk = plurkApiLogin() | |
userName = '' # userName you want to crawl | |
userSearch = plurk.callAPI('/APP/UserSearch/search', {'query': userName})['users'] | |
id = userSearch[0]['id'] | |
print('id=',id) | |
lowStandardFav = -1 | |
basePath = os.getcwd() + '\\' | |
baseUrl = "https://www.plurk.com/p/" | |
""" | |
NUM_WORKERS = 30 | |
threads = [] | |
for i in range(NUM_WORKERS): | |
tumblr = Tumblr(q) | |
tumblr.setDaemon(True) | |
tumblr.start() | |
threads.append(tumblr) | |
while True: | |
for i in threads: | |
if not i.isAlive(): | |
break | |
time.sleep(1)""" | |
thisPostMediaCount = 0 | |
#multiInfoDict['postCount'] += 1 | |
#multiInfoDict['thisPostMediaCount'] = 0 | |
#print("!!!!!!!!!!!!!!!") | |
#print(i) | |
#print("!!!!!!!!!!!!!!!") | |
if (i['owner_id'] != id): | |
print("posted:", i['posted']) | |
#print("@@@@@@@@@@replurk@@@@@@@@@@") | |
return | |
if (i['favorite_count'] > lowStandardFav): | |
print("===================================================================================") | |
#multiInfoDict['higherFavPostCount'] += 1 | |
owner_id_int = i['owner_id'] | |
owner_id = str(i['owner_id']) | |
print("owner_id:", i['owner_id']) | |
base36_plurk_id = str(base36.dumps(i['plurk_id'])) | |
print("postUrl:", baseUrl + base36_plurk_id) | |
print("posted:", i['posted']) | |
splitStr = i['posted'].split() | |
abbr_to_num = {name: num for num, name in enumerate(calendar.month_abbr) if num} | |
fileNameTime = splitStr[3] + '_' + str(abbr_to_num[splitStr[2]]) + '_' + splitStr[1] | |
print("******************") | |
print("porn:", i['porn']) | |
print("favorite_count:", i['favorite_count']) | |
print("response_count:", i['response_count']) | |
pprint(i['content_raw']) # type:str | |
_list = i['content'].split() | |
for content in _list: | |
if (content[0:4] == 'href'): | |
content = content[:-1] | |
if (content[-3:] == 'jpg'): | |
if (re.match(url_validation_regex, str(content[6:])) is None): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
if (urlExists(str(content[6:])) == False): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
print(content[6:]) | |
#multiInfoDict['downloadedMedia'] += 1 | |
thisPostMediaCount += 1 | |
image_name = fileNameTime + '-' + base36_plurk_id + '-' + str( | |
thisPostMediaCount) + '-' + owner_id + ".jpg" | |
path = basePath + image_name | |
if (os.path.isfile(path)): | |
print(image_name, "was already downloaded.") | |
#multiInfoDict['downloadedMedia'] -= 1 | |
continue | |
with open(image_name, 'wb') as handler: | |
handler.write(requests.get(str(content[6:])).content) | |
elif (content[-3:] == 'png'): | |
if (re.match(url_validation_regex, str(content[6:])) is None): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
if (urlExists(str(content[6:])) == False): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
print(content[6:]) | |
#multiInfoDict['downloadedMedia'] += 1 | |
thisPostMediaCount += 1 | |
image_name = fileNameTime + '-' + base36_plurk_id + '-' + str( | |
thisPostMediaCount) + '-' + owner_id + ".png" | |
path = basePath + image_name | |
if (os.path.isfile(path)): | |
print(image_name, "was already downloaded.") | |
#multiInfoDict['downloadedMedia'] -= 1 | |
continue | |
with open(image_name, 'wb') as handler: | |
handler.write(requests.get(str(content[6:])).content) | |
elif (content[-3:] == 'gif'): | |
if (re.match(url_validation_regex, str(content[6:])) is None): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
if (urlExists(str(content[6:])) == False): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
print(content[6:]) | |
#multiInfoDict['downloadedMedia'] += 1 | |
thisPostMediaCount += 1 | |
image_name = fileNameTime + '-' + base36_plurk_id + '-' + str( | |
thisPostMediaCount) + '-' + owner_id + ".gif" | |
path = basePath + image_name | |
if (os.path.isfile(path)): | |
print(image_name, "was already downloaded.") | |
#multiInfoDict['downloadedMedia'] -= 1 | |
continue | |
with open(image_name, 'wb') as handler: | |
handler.write(requests.get(str(content[6:])).content) | |
elif (content[-3:] == 'mp4'): | |
if (re.match(url_validation_regex, str(content[6:])) is None): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
if (urlExists(str(content[6:])) == False): | |
print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!") | |
continue | |
print(content[6:]) | |
#multiInfoDict['downloadedMedia'] += 1 | |
thisPostMediaCount += 1 | |
image_name = fileNameTime + '-' + base36_plurk_id + '-' + str( | |
thisPostMediaCount) + '-' + owner_id + ".mp4" | |
path = basePath + image_name | |
if (os.path.isfile(path)): | |
print(image_name, "was already downloaded.") | |
#multiInfoDict['downloadedMedia'] -= 1 | |
continue | |
#multiInfoDict['downloadedMedia'] += 1 | |
#multiInfoDict['thisPostMediaCount'] += 1 | |
with open(image_name, 'wb') as handler: | |
handler.write(requests.get(str(content[6:])).content) | |
else: | |
print("others link:", content[6:]) | |
#getResponses(plurk, id, owner_id_int, owner_id, fileNameTime, base36_plurk_id, thisPostMediaCount) | |
#return multiInfoDict | |
def test_1(i): | |
print(type(i)) | |
# https://github.com/MorvanZhou/easy-scraping-tutorial/blob/master/notebook/4-2-asyncio.ipynb | |
if __name__ == "__main__": | |
t1 = time.time() | |
global plurk | |
plurk = plurkApiLogin() | |
global id | |
userName = '' # userName you want to crawl | |
userSearch = plurk.callAPI('/APP/UserSearch/search', {'query': userName})['users'] | |
if (len(userSearch) == 0): | |
userPlurkUrl = 'https://www.plurk.com/' + userName | |
userPlurkhtml = requests.get(userPlurkUrl, timeout=10) | |
id = 123 | |
print(userPlurkhtml) | |
# print(userName, " has block the search or you type a wrong userName.") | |
else: | |
id = userSearch[0]['id'] | |
print(userSearch[0]['display_name']) | |
# pool = mp.Pool(8) | |
timeOffset = strftime("%Y-%m-%dT%H:%M:%S", gmtime()) | |
while (True): | |
json = getPublicPlurks(plurk, id, timeOffset) | |
if (len(json) == 0): | |
break | |
splitStr = json[-1]['posted'].split() | |
abbr_to_num = {name: num for num, name in enumerate(calendar.month_abbr) if num} | |
timeOffset = splitStr[3] + '-' + str(abbr_to_num[splitStr[2]]) + '-' + splitStr[1] + 'T' + \ | |
splitStr[4] | |
print(timeOffset) | |
# Parse | |
global lowStandardFav | |
lowStandardFav = -1 | |
postCount = 0 | |
higherFavPostCount = 0 | |
downloadedMedia = 0 | |
response_media = 0 | |
#multiInfoDict = {'lowStandardFav': lowStandardFav, 'postCount': postCount, | |
# 'higherFavPostCount': higherFavPostCount, 'downloadedMedia': downloadedMedia, | |
# 'response_media': response_media, 'thisPostMediaCount': 0} | |
#for post in json: | |
#print(len(post)) | |
#test_1(post) | |
#print(post['favorite_count']) | |
#parsePostsJob(post) | |
#break | |
# | |
#manager = Manager() | |
#json_dict = manager.dict() | |
with Pool() as pool: | |
pool.map(parsePostsJob, json) | |
#parsePosts(plurk, json, userId, multiInfoDict) | |
#print(multiInfoDict) | |
#break | |
#main() | |
#loop = asyncio.get_event_loop() | |
#loop.run_until_complete(main(loop)) | |
print("Total time: ", time.time() - t1) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment