Skip to content

Instantly share code, notes, and snippets.

@freelze
Last active September 4, 2018 11:56
Show Gist options
  • Save freelze/f56c683212479523b7be32c5ba22bdff to your computer and use it in GitHub Desktop.
Slow Plurk media (jpg, png, gif) crawler
#!/usr/bin/python
# -*- coding:utf-8 -*-
# API: https://github.com/clsung/plurk-oauth
# You can retrieve your app keys via the test tool at http://www.plurk.com/PlurkApp/
# OAuth credentials for the Plurk API — fill these in before running the crawler.
CONSUMER_KEY = ''
CONSUMER_SECRET = ''
ACCESS_TOKEN = ''
ACCESS_TOKEN_SECRET = ''
import re
import json
import os
import urllib
from plurk_oauth import PlurkAPI
import requests
import calendar
import time
from time import gmtime, strftime
from pprint import pprint
# https://stackoverflow.com/questions/1181919/python-base-36-encoding
import base36
#timeOffset = '2014-7-09T16:00:18'
#raw_json = plurk.callAPI('/APP/Timeline/getPlurks',{'limit':1000})['plurks'] #30
#raw_json = plurk.callAPI('/APP/Profile/getOwnProfile')['plurks']
#raw_json = plurk.callAPI('/APP/Timeline/getUnreadPlurks')['plurks'] #197
#raw_json = plurk.callAPI('/APP/Polling/getPlurks',{'offset':'2013-1-20T21:55:34','limit':1000,'favorers_detail':False,'limited_detail':False,'replurkers_detail':False})['plurks']
#raw_json = plurk.callAPI('/APP/Realtime/getUserChannel')['plurks']
#raw_json = plurk.callAPI('/APP/Timeline/getPublicPlurks',{'user_id':'','offset':'2010-1-03T12:49:35','limit':1000,'favorers_detail':False,'limited_detail':False,'replurkers_detail':False})['plurks'] # limit = 30
# ref: https://stackoverflow.com/questions/7160737/python-how-to-validate-a-url-in-python-malformed-or-not
# Compiled once at module load; used to sanity-check every media URL before
# it is fetched and written to disk.
url_validation_regex = re.compile(
r'^(?:http|ftp)s?://' # http:// or https://
r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' #domain...
r'localhost|' #localhost...
r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # ...or ip
r'(?::\d+)?' # optional port
r'(?:/?|[/?]\S+)$', re.IGNORECASE)
# ref: https://stackoverflow.com/questions/16511337/correct-way-to-try-except-using-python-requests-module
def urlExists(path):
    """Return True if an HTTP GET of *path* answers with status 200.

    Any request failure (timeout, connection error, malformed URL, ...) is
    printed and treated as "does not exist" (returns False).
    """
    try:
        r = requests.get(path, timeout=10)
    # BUG FIX: the original listed RequestException FIRST.  HTTPError,
    # ConnectionError and Timeout are all subclasses of RequestException,
    # so those handlers were unreachable.  Specific exceptions must come
    # before their base class.
    except requests.exceptions.HTTPError as errh:
        print("Http Error:", errh)
        return False
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:", errc)
        return False
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:", errt)
        return False
    except requests.exceptions.RequestException as err:
        print("OOps: Something Else", err)
        return False
    else:
        return r.status_code == requests.codes.ok
def getPublicPlurks( _plurk, _id, time_Offset ):
    """Fetch one page (up to 30) of the user's public plurks older than *time_Offset*."""
    query = {
        'user_id': _id,
        'offset': time_Offset,
        'limit': 30,
        'favorers_detail': False,
        'limited_detail': False,
        'replurkers_detail': False,
    }
    return _plurk.callAPI('/APP/Timeline/getPublicPlurks', query)['plurks']
def plurkApiLogin():
    """Authorize against the Plurk API; terminate the program if the keys are bad."""
    api = PlurkAPI(CONSUMER_KEY, CONSUMER_SECRET)
    api.authorize(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
    # checkToken returns None when the consumer key/secret pair is rejected.
    if api.callAPI('/APP/checkToken') is None:
        print("Your CONSUMER_KEY or CONSUMER_SECRET is wrong!")
        time.sleep(1)
        exit()
    return api
def main():
    """Crawl a user's public plurks page by page and download the media in them."""
    plurk = plurkApiLogin()
    userName = ''  # The userName you want to crawl
    userSearch = plurk.callAPI('/APP/UserSearch/search', {'query': userName})['users']
    if len(userSearch) == 0:
        # User blocked search (or the name is wrong); fetch the profile page
        # directly.  NOTE(review): userId = 123 is a placeholder — the real
        # numeric id is never parsed out of the profile HTML.
        userPlurkUrl = 'https://www.plurk.com/' + userName
        userPlurkhtml = requests.get(userPlurkUrl, timeout=10)
        userId = 123
        print(userPlurkhtml)
        # print(userName, " has block the search or you type a wrong userName.")
    else:
        userId = userSearch[0]['id']
        print(userSearch[0]['display_name'])
    # BUG FIX: removed `pool = mp.Pool(8)` — `mp` (multiprocessing) was never
    # imported, so that line raised NameError at runtime, and the pool was
    # never used anywhere.
    timeOffset = strftime("%Y-%m-%dT%H:%M:%S", gmtime())
    while True:
        # Renamed from `json` so the stdlib json module is not shadowed.
        plurks = getPublicPlurks(plurk, userId, timeOffset)
        if len(plurks) == 0:
            break
        # 'posted' looks like "Tue, 04 Sep 2018 11:56:00 GMT"; rebuild the
        # "YYYY-M-DDTHH:MM:SS" offset from the oldest plurk on this page.
        splitStr = plurks[-1]['posted'].split()
        abbr_to_num = {name: num for num, name in enumerate(calendar.month_abbr) if num}
        timeOffset = splitStr[3] + '-' + str(abbr_to_num[splitStr[2]]) + '-' + splitStr[1] + 'T' + \
            splitStr[4]
        print(timeOffset)
        multiInfoDict = {
            'lowStandardFav': -1,       # download any post with more favorites than this
            'postCount': 0,
            'higherFavPostCount': 0,
            'downloadedMedia': 0,
            'response_media': 0,
            'thisPostMediaCount': 0,
        }
        multiInfoDict = parsePosts(plurk, plurks, userId, multiInfoDict)
        print(multiInfoDict)
def getResponses(_plurk, pID, owner_id_int, owner_id, fileNameTime, base36_plurk_id, multiInfoDict):
    """Download jpg/png/gif links the plurk's author posted in its responses.

    Files are written to the current working directory as
    "<fileNameTime>-<base36_plurk_id>-<mediaNo>-response-<responseNo>-<owner_id>.<ext>".
    Returns the updated *multiInfoDict* counter dict.
    """
    res_raw_json = _plurk.callAPI('/APP/Responses/get', {'plurk_id': pID})
    basePath = os.getcwd() + '\\'  # NOTE(review): Windows-only path separator
    response_count = 0
    # NOTE(review): response_media is counted but never copied back into
    # multiInfoDict — kept for behavior compatibility.
    response_media = 0
    for resp in res_raw_json['responses']:
        # Only media posted by the plurk's author is downloaded.
        if resp['user_id'] != owner_id_int:
            continue
        print("author content")
        for responseLink in resp['content_raw'].split():
            # CONSISTENCY FIX: the original repeated this whole body verbatim
            # for '.jpg', '.png' and '.gif'; one path keyed on the extension
            # behaves identically.
            ext = responseLink[-4:]
            if ext not in ('.jpg', '.png', '.gif'):
                continue
            if re.match(url_validation_regex, responseLink) is None:
                print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                continue
            if urlExists(responseLink) == False:
                print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                continue
            response_media += 1
            response_count += 1
            multiInfoDict['downloadedMedia'] += 1
            multiInfoDict['thisPostMediaCount'] += 1
            image_name = fileNameTime + '-' + base36_plurk_id + '-' + str(
                multiInfoDict['thisPostMediaCount']) + '-' + "response" + '-' + str(
                response_count) + '-' + owner_id + ext
            path = basePath + image_name
            if os.path.isfile(path):
                print(image_name, "was already downloaded.")
                multiInfoDict['downloadedMedia'] -= 1
                continue
            with open(image_name, 'wb') as handler:
                handler.write(requests.get(str(responseLink)).content)
    return multiInfoDict
def parsePosts(_plurk, raw_json, id, multiInfoDict):
    """Walk one page of plurks; download jpg/png/gif/mp4 links from each post's
    content, then download media from the post's responses.

    *id* (name kept for interface compatibility although it shadows the
    builtin) is the crawled user's numeric id; replurks by other owners are
    skipped.  Returns the updated *multiInfoDict* counter dict.
    """
    basePath = os.getcwd() + '\\'  # NOTE(review): Windows-only path separator
    baseUrl = "https://www.plurk.com/p/"
    for i in raw_json:
        multiInfoDict['postCount'] += 1
        multiInfoDict['thisPostMediaCount'] = 0
        if i['owner_id'] != id:
            # Replurked from someone else: skip.
            print("posted:", i['posted'])
            print("@@@@@@@@@@replurk@@@@@@@@@@")
            continue
        if i['favorite_count'] > multiInfoDict['lowStandardFav']:
            print("===================================================================================")
            multiInfoDict['higherFavPostCount'] += 1
            owner_id_int = i['owner_id']
            owner_id = str(i['owner_id'])
            print("owner_id:", i['owner_id'])
            base36_plurk_id = str(base36.dumps(i['plurk_id']))
            print("postUrl:", baseUrl + base36_plurk_id)
            print("posted:", i['posted'])
            # 'posted' looks like "Tue, 04 Sep 2018 11:56:00 GMT"
            splitStr = i['posted'].split()
            abbr_to_num = {name: num for num, name in enumerate(calendar.month_abbr) if num}
            fileNameTime = splitStr[3] + '_' + str(abbr_to_num[splitStr[2]]) + '_' + splitStr[1]
            print("******************")
            print("porn:", i['porn'])
            print("favorite_count:", i['favorite_count'])
            print("response_count:", i['response_count'])
            pprint(i['content_raw'])  # type:str
            for content in i['content'].split():
                # Media links appear in the rendered HTML as href="..." tokens.
                if content[0:4] != 'href':
                    continue
                content = content[:-1]   # drop the trailing quote
                link = str(content[6:])  # drop the leading href="
                ext = content[-3:]
                # CONSISTENCY FIX: the original repeated this body verbatim
                # for jpg/png/gif/mp4; one path keyed on the extension
                # behaves identically.
                if ext in ('jpg', 'png', 'gif', 'mp4'):
                    if re.match(url_validation_regex, link) is None:
                        print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                        continue
                    if urlExists(link) == False:
                        print("FALSE!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
                        continue
                    print(link)
                    multiInfoDict['downloadedMedia'] += 1
                    multiInfoDict['thisPostMediaCount'] += 1
                    image_name = fileNameTime + '-' + base36_plurk_id + '-' + str(
                        multiInfoDict['thisPostMediaCount']) + '-' + owner_id + '.' + ext
                    path = basePath + image_name
                    if os.path.isfile(path):
                        print(image_name, "was already downloaded.")
                        multiInfoDict['downloadedMedia'] -= 1
                        continue
                    # BUG FIX: the original mp4 branch incremented
                    # downloadedMedia and thisPostMediaCount a SECOND time
                    # right before the download, skewing the counters and the
                    # filename numbering for every mp4.
                    with open(image_name, 'wb') as handler:
                        handler.write(requests.get(link).content)
                else:
                    print("others link:", content[6:])
            multiInfoDict = getResponses(_plurk, i['plurk_id'], owner_id_int, owner_id,
                                         fileNameTime, base36_plurk_id, multiInfoDict)
    return multiInfoDict
if __name__ == "__main__":
    # Time the whole crawl with a wall-clock stamp taken before main() runs.
    start = time.time()
    main()
    print("Total time: ", time.time() - start)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment