Plurk media crawler (without response media), using multiprocessing to speed up the crawling.
#!/usr/bin/python
# -*- coding:utf-8 -*-
"""
Compare MultiProcessing and Normal Way
Case:crawl 84 images in 81 plurk posts
(sec)
MultiProcessing :
117.3681275844574
118.62501263618469
122.00638699531555
=========================
Normal Way:
376.74958324432373
384.2063329219818
"""
# ref: https://github.com/clsung/plurk-oauth
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
ACCESS_TOKEN = ''
ACCESS_TOKEN_SECRET = ''
#import aiohttp
#import asyncio
#import multiprocessing as mp
from multiprocessing import Pool
#from pathos.multiprocessing import ProcessingPool as Pool
#from multiprocessing import Manager
#from pathos.pools import ProcessPool as Pool
#from multiprocessing import Process
#import threading
import re
import json
import os
#import urllib2
import urllib
from plurk_oauth import PlurkAPI
import requests
import calendar
import time
from time import gmtime, strftime
from pprint import pprint
# https://stackoverflow.com/questions/1181919/python-base-36-encoding
import base36
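# e.g. base36.dumps(123456789) == '21i3v9'; Plurk post URLs use the base36
# form of the numeric plurk_id (see baseUrl + base36_plurk_id below).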
# Fill in the keys and secrets above with your own app's credentials.
# You can retrieve your app keys via the test tool at http://www.plurk.com/PlurkApp/
#timeOffset = '2014-7-09T16:00:18'
#raw_json = plurk.callAPI('/APP/Timeline/getPlurks',{'limit':1000})['plurks'] #30
#raw_json = plurk.callAPI('/APP/Profile/getOwnProfile')['plurks']
#raw_json = plurk.callAPI('/APP/Timeline/getUnreadPlurks')['plurks'] #197
#raw_json = plurk.callAPI('/APP/Polling/getPlurks',{'offset':'2013-1-20T21:55:34','limit':1000,'favorers_detail':False,'limited_detail':False,'replurkers_detail':False})['plurks'] #50
#raw_json = plurk.callAPI('/APP/Realtime/getUserChannel')['plurks']
#raw_json = plurk.callAPI('/APP/Timeline/getPublicPlurks',{'user_id':'dark42042n', 'offset':'2010-1-03T12:49:35','limit':1000,'favorers_detail':False,'limited_detail':False,'replurkers_detail':False})['plurks'] # limit = 30
# ref: https://stackoverflow.com/questions/7160737/python-how-to-validate-a-url-in-python-malformed-or-not
url_validation_regex = re.compile(
    r'^(?:http|ftp)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or IP
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)
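# e.g. re.match(url_validation_regex, 'https://images.plurk.com/abc.jpg') matches,
# while a malformed token such as 'htp:/bad' does not.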
# ref: https://stackoverflow.com/questions/16511337/correct-way-to-try-except-using-python-requests-module
def urlExists(path):
    try:
        r = requests.get(path, timeout=10)
    except requests.exceptions.HTTPError as errh:
        print("Http Error:", errh)
        return False
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:", errc)
        return False
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:", errt)
        return False
    except requests.exceptions.RequestException as err:
        # RequestException is the base class of the above, so it must come last
        print("Oops, something else went wrong:", err)
        return False
    else:
        return r.status_code == requests.codes.ok
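# urlExists('https://images.plurk.com/abc.jpg') -> True only when the server
# answers 200 OK; any request error is logged and treated as False.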
def getPublicPlurks(_plurk, _id, time_Offset):
    rawJson = _plurk.callAPI('/APP/Timeline/getPublicPlurks',
                             {'user_id': _id, 'offset': time_Offset, 'limit': 30,
                              'favorers_detail': False, 'limited_detail': False,
                              'replurkers_detail': False})['plurks']
    return rawJson
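# The API caps each page at 30 plurks; the __main__ loop pages backwards by
# feeding the timestamp of the oldest plurk returned as the next offset.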
def plurkApiLogin():
    _plurk = PlurkAPI(CONSUMER_KEY, CONSUMER_SECRET)
    _plurk.authorize(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    checkToken = _plurk.callAPI('/APP/checkToken')
    if checkToken is None:
        print("Your CONSUMER_KEY or CONSUMER_SECRET is wrong!")
        time.sleep(1)
        raise SystemExit
    return _plurk
"""
class myThread(threading.Thread):
def __init__(self, queue):
self.user_queue = queue
self.total_user = []
self.total_url = []
#self.f_user = open('user.txt', 'a+')
#self.f_source = open('source.txt', 'a+')
threading.Thread.__init__(self)
def download(self, url):
res = requests.get(url)
source_list = []
soup = BeautifulSoup(res.text)
iframes = soup.find_all('iframe')
tmp_source = []
for i in iframes:
source = i.get('src', '').strip()
if source and source.find('https://www.tumblr.com/video') != -1 and source not in self.total_url:
source_list.append(source)
tmp_source.append(source)
print (u'新增链接:' + source)
tmp_user = []
new_users = soup.find_all(class_='reblog-link')
for user in new_users:
username = user.text.strip()
if username and username not in self.total_user:
self.user_queue.put(username)
self.total_user.append(username)
tmp_user.append(username)
print (u'新增用户:' + username)
#mutex.acquire()
#if tmp_user:
#self.f_user.write('\n'.join(tmp_user) + '\n')
#if tmp_source:
#self.f_source.write('\n'.join(tmp_source) + '\n')
#mutex.release()
def run(self):
global is_exit
while not is_exit:
user = self.user_queue.get()
url = 'http://%s.tumblr.com/' % user
self.download(url)
time.sleep(2)
#self.f_user.close()
#self.f_source.close()
"""
basePath = os.getcwd() + os.sep  # platform separator instead of a hard-coded '\\'
baseUrl = "https://www.plurk.com/p/"
"""
NUM_WORKERS = 30
threads = []
for i in range(NUM_WORKERS):
tumblr = Tumblr(q)
tumblr.setDaemon(True)
tumblr.start()
threads.append(tumblr)
while True:
for i in threads:
if not i.isAlive():
break
time.sleep(1)"""
# Single-process ("normal way") parser, used for the timing comparison in the
# docstring above; its signature follows the commented-out parsePosts(...)
# call in the __main__ block.
def parsePosts(_plurk, raw_json, id, multiInfoDict):
    for i in raw_json:
        multiInfoDict['postCount'] += 1
        multiInfoDict['thisPostMediaCount'] = 0
        if i['owner_id'] != id:
            print("posted:", i['posted'])
            print("@@@@@@@@@@ replurk @@@@@@@@@@")
            continue
        if i['favorite_count'] > multiInfoDict['lowStandardFav']:
            print("===================================================================================")
            multiInfoDict['higherFavPostCount'] += 1
            owner_id_int = i['owner_id']
            owner_id = str(i['owner_id'])
            print("owner_id:", i['owner_id'])
            base36_plurk_id = str(base36.dumps(i['plurk_id']))
            print("postUrl:", baseUrl + base36_plurk_id)
            print("posted:", i['posted'])
            splitStr = i['posted'].split()
            abbr_to_num = {name: num for num, name in enumerate(calendar.month_abbr) if num}
            fileNameTime = splitStr[3] + '_' + str(abbr_to_num[splitStr[2]]) + '_' + splitStr[1]
            print("******************")
            print("porn:", i['porn'])
            print("favorite_count:", i['favorite_count'])
            print("response_count:", i['response_count'])
            pprint(i['content_raw'])  # type: str
            for content in i['content'].split():
                if content[0:4] != 'href':
                    continue
                content = content[:-1]   # drop the trailing quote
                url = str(content[6:])   # drop the leading href="
                ext = content[-3:]
                if ext not in ('jpg', 'png', 'gif', 'mp4'):
                    print("others link:", url)
                    continue
                if re.match(url_validation_regex, url) is None or not urlExists(url):
                    print("INVALID URL:", url)
                    continue
                print(url)
                multiInfoDict['downloadedMedia'] += 1
                multiInfoDict['thisPostMediaCount'] += 1
                image_name = (fileNameTime + '-' + base36_plurk_id + '-' +
                              str(multiInfoDict['thisPostMediaCount']) + '-' + owner_id + '.' + ext)
                if os.path.isfile(basePath + image_name):
                    print(image_name, "was already downloaded.")
                    multiInfoDict['downloadedMedia'] -= 1
                    continue
                with open(image_name, 'wb') as handler:
                    handler.write(requests.get(url).content)
            multiInfoDict['thisPostMediaCount'] = getResponses(
                _plurk, i['plurk_id'], owner_id_int, owner_id,
                fileNameTime, base36_plurk_id, multiInfoDict['thisPostMediaCount'])
    return multiInfoDict
def getResponses(plurk, pID, owner_id_int, owner_id, fileNameTime, base36_plurk_id, thisPostMediaCount):
    res_raw_json = plurk.callAPI('/APP/Responses/get', {'plurk_id': pID})
    basePath = os.getcwd() + os.sep
    response_count = 0
    response_media = 0
    # loop over the responses; only media posted by the plurk's author count
    for j in res_raw_json['responses']:
        if j['user_id'] != owner_id_int:
            continue
        print("author content")
        for responseLink in j['content_raw'].split():
            ext = responseLink[-4:]
            if ext not in ('.jpg', '.png', '.gif'):
                continue
            if re.match(url_validation_regex, responseLink) is None or not urlExists(responseLink):
                print("INVALID URL:", responseLink)
                continue
            response_media += 1
            response_count += 1
            thisPostMediaCount += 1
            image_name = (fileNameTime + '-' + base36_plurk_id + '-' + str(thisPostMediaCount) +
                          '-response-' + str(response_count) + '-' + owner_id + ext)
            if os.path.isfile(basePath + image_name):
                print(image_name, "was already downloaded.")
                continue
            with open(image_name, 'wb') as handler:
                handler.write(requests.get(responseLink).content)
    return thisPostMediaCount
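# Resulting name pattern (illustrative): '2018_9_5-<base36 id>-<post media #>-response-<response #>-<owner id>.jpg'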
def parsePostsJob(i):
    # multiprocessing worker: receives one plurk dict; each process builds its
    # own PlurkAPI session because the client object cannot be shared across processes
    plurk = plurkApiLogin()
    userName = ''  # userName you want to crawl
    userSearch = plurk.callAPI('/APP/UserSearch/search', {'query': userName})['users']
    id = userSearch[0]['id']
    print('id =', id)
    lowStandardFav = -1
    basePath = os.getcwd() + os.sep
    baseUrl = "https://www.plurk.com/p/"
    thisPostMediaCount = 0
    if i['owner_id'] != id:
        print("posted:", i['posted'])  # a replurk, not the owner's post
        return
    if i['favorite_count'] > lowStandardFav:
        print("===================================================================================")
        owner_id_int = i['owner_id']
        owner_id = str(i['owner_id'])
        print("owner_id:", i['owner_id'])
        base36_plurk_id = str(base36.dumps(i['plurk_id']))
        print("postUrl:", baseUrl + base36_plurk_id)
        print("posted:", i['posted'])
        splitStr = i['posted'].split()
        abbr_to_num = {name: num for num, name in enumerate(calendar.month_abbr) if num}
        fileNameTime = splitStr[3] + '_' + str(abbr_to_num[splitStr[2]]) + '_' + splitStr[1]
        print("******************")
        print("porn:", i['porn'])
        print("favorite_count:", i['favorite_count'])
        print("response_count:", i['response_count'])
        pprint(i['content_raw'])  # type: str
        for content in i['content'].split():
            if content[0:4] != 'href':
                continue
            content = content[:-1]   # drop the trailing quote
            url = str(content[6:])   # drop the leading href="
            ext = content[-3:]
            if ext not in ('jpg', 'png', 'gif', 'mp4'):
                print("others link:", url)
                continue
            if re.match(url_validation_regex, url) is None or not urlExists(url):
                print("INVALID URL:", url)
                continue
            print(url)
            thisPostMediaCount += 1
            image_name = (fileNameTime + '-' + base36_plurk_id + '-' +
                          str(thisPostMediaCount) + '-' + owner_id + '.' + ext)
            if os.path.isfile(basePath + image_name):
                print(image_name, "was already downloaded.")
                continue
            with open(image_name, 'wb') as handler:
                handler.write(requests.get(url).content)
        # Response media is skipped in the multiprocessing version:
        #getResponses(plurk, i['plurk_id'], owner_id_int, owner_id, fileNameTime, base36_plurk_id, thisPostMediaCount)
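# Note: Pool workers receive each plurk dict pickled from the parent process;
# all counters here are per-process, so the global tallies kept by the
# single-process path (multiInfoDict) are not aggregated in this version.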
def test_1(i):
    print(type(i))
# ref: https://github.com/MorvanZhou/easy-scraping-tutorial/blob/master/notebook/4-2-asyncio.ipynb
if __name__ == "__main__":
t1 = time.time()
global plurk
plurk = plurkApiLogin()
global id
userName = '' # userName you want to crawl
userSearch = plurk.callAPI('/APP/UserSearch/search', {'query': userName})['users']
if (len(userSearch) == 0):
userPlurkUrl = 'https://www.plurk.com/' + userName
userPlurkhtml = requests.get(userPlurkUrl, timeout=10)
id = 123
print(userPlurkhtml)
# print(userName, " has block the search or you type a wrong userName.")
else:
id = userSearch[0]['id']
print(userSearch[0]['display_name'])
# pool = mp.Pool(8)
timeOffset = strftime("%Y-%m-%dT%H:%M:%S", gmtime())
while (True):
json = getPublicPlurks(plurk, id, timeOffset)
if (len(json) == 0):
break
splitStr = json[-1]['posted'].split()
abbr_to_num = {name: num for num, name in enumerate(calendar.month_abbr) if num}
timeOffset = splitStr[3] + '-' + str(abbr_to_num[splitStr[2]]) + '-' + splitStr[1] + 'T' + \
splitStr[4]
print(timeOffset)
# Parse
global lowStandardFav
lowStandardFav = -1
postCount = 0
higherFavPostCount = 0
downloadedMedia = 0
response_media = 0
#multiInfoDict = {'lowStandardFav': lowStandardFav, 'postCount': postCount,
# 'higherFavPostCount': higherFavPostCount, 'downloadedMedia': downloadedMedia,
# 'response_media': response_media, 'thisPostMediaCount': 0}
#for post in json:
#print(len(post))
#test_1(post)
#print(post['favorite_count'])
#parsePostsJob(post)
#break
#
#manager = Manager()
#json_dict = manager.dict()
with Pool() as pool:
pool.map(parsePostsJob, json)
#parsePosts(plurk, json, userId, multiInfoDict)
#print(multiInfoDict)
#break
#main()
#loop = asyncio.get_event_loop()
#loop.run_until_complete(main(loop))
print("Total time: ", time.time() - t1)