Slow Plurk media (jpg, png, gif) crawler
#!/usr/bin/python
# -*- coding:utf-8 -*-
# API wrapper: https://github.com/clsung/plurk-oauth
# You can retrieve your app keys via the test tool at http://www.plurk.com/PlurkApp/
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
ACCESS_TOKEN = ''
ACCESS_TOKEN_SECRET = ''
import re
import json
import os
import urllib
from plurk_oauth import PlurkAPI
import requests
import calendar
import time
from time import gmtime, strftime
from pprint import pprint
# https://stackoverflow.com/questions/1181919/python-base-36-encoding
import base36
#timeOffset = '2014-7-09T16:00:18'
#raw_json = plurk.callAPI('/APP/Timeline/getPlurks',{'limit':1000})['plurks'] #30
#raw_json = plurk.callAPI('/APP/Profile/getOwnProfile')['plurks']
#raw_json = plurk.callAPI('/APP/Timeline/getUnreadPlurks')['plurks'] #197
#raw_json = plurk.callAPI('/APP/Polling/getPlurks',{'offset':'2013-1-20T21:55:34','limit':1000,'favorers_detail':False,'limited_detail':False,'replurkers_detail':False})['plurks']
#raw_json = plurk.callAPI('/APP/Realtime/getUserChannel')['plurks']
#raw_json = plurk.callAPI('/APP/Timeline/getPublicPlurks',{'user_id':'','offset':'2010-1-03T12:49:35','limit':1000,'favorers_detail':False,'limited_detail':False,'replurkers_detail':False})['plurks'] # limit = 30
# ref: https://stackoverflow.com/questions/7160737/python-how-to-validate-a-url-in-python-malformed-or-not | |
url_validation_regex = re.compile( | |
r'^(?:http|ftp)s?://' # http:// or https:// | |
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain... | |
r'localhost|' #localhost... | |
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip | |
r'(?::\d+)?' # optional port | |
r'(?:/?|[/?]\S+)$', re.IGNORECASE) | |

# ref: https://stackoverflow.com/questions/16511337/correct-way-to-try-except-using-python-requests-module
def urlExists(path):
    try:
        r = requests.get(path, timeout=10)
        #print('r =', r)
    except requests.exceptions.HTTPError as errh:
        print("Http Error:", errh)
        return False
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:", errc)
        return False
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:", errt)
        return False
    except requests.exceptions.RequestException as err:
        # RequestException is the base class, so it must come last or the
        # more specific handlers above would never run.
        print("Oops: something else:", err)
        return False
    else:
        return r.status_code == requests.codes.ok

def getPublicPlurks(_plurk, _id, time_Offset):
    # Fetch up to 30 public plurks for the user, older than time_Offset.
    rawJson = _plurk.callAPI('/APP/Timeline/getPublicPlurks',
                             {'user_id': _id, 'offset': time_Offset, 'limit': 30,
                              'favorers_detail': False, 'limited_detail': False,
                              'replurkers_detail': False})['plurks']
    return rawJson

def plurkApiLogin():
    _plurk = PlurkAPI(CONSUMER_KEY, CONSUMER_SECRET)
    _plurk.authorize(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    checkToken = _plurk.callAPI('/APP/checkToken')
    if checkToken is None:
        print("Your consumer key/secret or access token is wrong!")
        time.sleep(1)
        exit()
    return _plurk

def main():
    plurk = plurkApiLogin()
    userName = ''  # the userName you want to crawl
    userSearch = plurk.callAPI('/APP/UserSearch/search', {'query': userName})['users']
    if len(userSearch) == 0:
        # The user has blocked search indexing, or the userName is wrong.
        userPlurkUrl = 'https://www.plurk.com/' + userName
        userPlurkhtml = requests.get(userPlurkUrl, timeout=10)
        userId = 123  # placeholder: set the numeric user id by hand in this case
        print(userPlurkhtml)
    else:
        userId = userSearch[0]['id']
        print(userSearch[0]['display_name'])
    timeOffset = strftime("%Y-%m-%dT%H:%M:%S", gmtime())
    while True:
        rawJson = getPublicPlurks(plurk, userId, timeOffset)
        if len(rawJson) == 0:
            break
        # Build the next offset from the 'posted' field of the oldest plurk in this
        # batch, e.g. "Fri, 05 Jan 2018 12:34:56 GMT" -> "2018-1-05T12:34:56".
        splitStr = rawJson[-1]['posted'].split()
        abbr_to_num = {name: num for num, name in enumerate(calendar.month_abbr) if num}
        timeOffset = splitStr[3] + '-' + str(abbr_to_num[splitStr[2]]) + '-' + splitStr[1] + 'T' + splitStr[4]
        print(timeOffset)
        # Per-batch statistics passed through parsePosts()/getResponses().
        multiInfoDict = {'lowStandardFav': -1, 'postCount': 0, 'higherFavPostCount': 0,
                         'downloadedMedia': 0, 'response_media': 0, 'thisPostMediaCount': 0}
        multiInfoDict = parsePosts(plurk, rawJson, userId, multiInfoDict)
        print(multiInfoDict)
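
# A minimal sketch of the same 'posted' -> offset conversion done inline above,
# using datetime.strptime instead of manual splitting. It is not wired into
# main(); it assumes an English locale and 'posted' strings in the
# "Fri, 05 Jan 2018 12:34:56 GMT" form shown above (its output is zero-padded,
# which the manual version is not).
def postedToOffset(posted):
    from datetime import datetime
    dt = datetime.strptime(posted, '%a, %d %b %Y %H:%M:%S %Z')
    return dt.strftime('%Y-%m-%dT%H:%M:%S')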

def getResponses(_plurk, pID, owner_id_int, owner_id, fileNameTime, base36_plurk_id, multiInfoDict):
    # Download jpg/png/gif links that the plurk's owner posted in the response thread.
    res_raw_json = _plurk.callAPI('/APP/Responses/get', {'plurk_id': pID})
    basePath = os.getcwd() + '\\'  # note: Windows-style path separator
    response_count = 0
    response_media = 0
    for j in res_raw_json['responses']:
        if j['user_id'] != owner_id_int:
            continue
        print("author content")
        res_content_raw = j['content_raw'].split()
        for responseLink in res_content_raw:
            ext = responseLink[-4:]
            if ext not in ('.jpg', '.png', '.gif'):
                continue
            if re.match(url_validation_regex, responseLink) is None or not urlExists(responseLink):
                print("invalid or unreachable link:", responseLink)
                continue
            response_media += 1
            response_count += 1
            multiInfoDict['downloadedMedia'] += 1
            multiInfoDict['thisPostMediaCount'] += 1
            image_name = (fileNameTime + '-' + base36_plurk_id + '-' +
                          str(multiInfoDict['thisPostMediaCount']) + '-response-' +
                          str(response_count) + '-' + owner_id + ext)
            path = basePath + image_name
            if os.path.isfile(path):
                print(image_name, "was already downloaded.")
                multiInfoDict['downloadedMedia'] -= 1
                continue
            with open(image_name, 'wb') as handler:
                handler.write(requests.get(responseLink).content)
    return multiInfoDict

def parsePosts(_plurk, raw_json, id, multiInfoDict):
    # Walk one batch of plurks: download media linked in the post content,
    # then media the owner posted in the responses.
    basePath = os.getcwd() + '\\'  # note: Windows-style path separator
    baseUrl = "https://www.plurk.com/p/"
    for i in raw_json:
        multiInfoDict['postCount'] += 1
        multiInfoDict['thisPostMediaCount'] = 0
        if i['owner_id'] != id:
            print("posted:", i['posted'])
            print("@@@@@@@@@@replurk@@@@@@@@@@")
            continue
        if i['favorite_count'] > multiInfoDict['lowStandardFav']:
            print("===================================================================================")
            multiInfoDict['higherFavPostCount'] += 1
            owner_id_int = i['owner_id']
            owner_id = str(i['owner_id'])
            print("owner_id:", i['owner_id'])
            base36_plurk_id = str(base36.dumps(i['plurk_id']))
            print("postUrl:", baseUrl + base36_plurk_id)
            print("posted:", i['posted'])
            splitStr = i['posted'].split()
            abbr_to_num = {name: num for num, name in enumerate(calendar.month_abbr) if num}
            fileNameTime = splitStr[3] + '_' + str(abbr_to_num[splitStr[2]]) + '_' + splitStr[1]
            print("******************")
            print("porn:", i['porn'])
            print("favorite_count:", i['favorite_count'])
            print("response_count:", i['response_count'])
            pprint(i['content_raw'])  # type: str
            # Media links appear in the rendered content as href="...ext" attributes.
            for content in i['content'].split():
                if content[0:4] != 'href':
                    continue
                content = content[:-1]    # drop the trailing quote
                link = str(content[6:])   # drop the leading href="
                ext = content[-3:]
                if ext not in ('jpg', 'png', 'gif', 'mp4'):
                    print("others link:", link)
                    continue
                if re.match(url_validation_regex, link) is None or not urlExists(link):
                    print("invalid or unreachable link:", link)
                    continue
                print(link)
                multiInfoDict['downloadedMedia'] += 1
                multiInfoDict['thisPostMediaCount'] += 1
                image_name = (fileNameTime + '-' + base36_plurk_id + '-' +
                              str(multiInfoDict['thisPostMediaCount']) + '-' + owner_id + '.' + ext)
                path = basePath + image_name
                if os.path.isfile(path):
                    print(image_name, "was already downloaded.")
                    multiInfoDict['downloadedMedia'] -= 1
                    continue
                with open(image_name, 'wb') as handler:
                    handler.write(requests.get(link).content)
            multiInfoDict = getResponses(_plurk, i['plurk_id'], owner_id_int, owner_id, fileNameTime, base36_plurk_id, multiInfoDict)
    return multiInfoDict
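
# getResponses() and parsePosts() both repeat the same validate / dedupe / save
# steps for each link. A minimal sketch of a shared helper they could call
# instead; the helper name and its wiring are an assumption, not something the
# original script defines or uses:
def downloadMediaFile(link, image_name):
    # Reject malformed or unreachable URLs before touching the disk.
    if re.match(url_validation_regex, link) is None or not urlExists(link):
        print("invalid or unreachable link:", link)
        return False
    # Skip files that were already downloaded into the working directory.
    if os.path.isfile(image_name):
        print(image_name, "was already downloaded.")
        return False
    with open(image_name, 'wb') as handler:
        handler.write(requests.get(link, timeout=10).content)
    return True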

if __name__ == "__main__":
    t1 = time.time()
    main()
    print("Total time: ", time.time() - t1)