Created
January 26, 2018 19:11
-
-
Save nix010/14c3e17e1913bd50a611687ec6288a58 to your computer and use it in GitHub Desktop.
Crawl pictures from Pinterest by searching for a keyword
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup as BS | |
import requests | |
class BaseCrawler(object):
    """Template for HTTP crawlers built on a shared ``requests`` session.

    ``crawl_now`` drives the pipeline ``call_request`` ->
    ``parse_response_data`` -> ``save_data_to_db``.  The three hooks do
    nothing here; subclasses override the ones they need.
    """

    # Endpoint the crawler talks to; left unset in the base class.
    api_url = None

    # Headers sent with every request unless overridden per call.
    # NOTE: this dict is shared by all instances, so it must never be
    # mutated in place -- see ``_build_headers``.
    default_headers = {
        # Quality values are attached with ';' (the original 'en,q=0.9'
        # advertised a bogus language tag named "q=0.9").
        'Accept-Language': 'en-US,en;q=0.9,vi;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        'Accept': '*/*',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/62.0.3202.94 Chrome/62.0.3202.94 Safari/537.36'
    }

    def __init__(self, email, password, user_id, **kwargs):
        """Remember the caller's identity and open an HTTP session.

        :param email: account e-mail supplied by the caller.
        :param password: account password supplied by the caller.
        :param user_id: id of the user the crawl is performed for.
        :param kwargs: accepted for forward compatibility and ignored.
        """
        # Previously these arguments were silently dropped; keep them so
        # subclasses and hooks can read them from the instance.
        self.email = email
        self.password = password
        self.user_id = user_id
        self.r = requests.Session()

    def _build_headers(self, headers=None):
        """Return a fresh dict: ``default_headers`` overlaid with *headers*."""
        # Copy first -- updating ``default_headers`` directly would leak
        # one call's extra headers into every later request.
        merged = dict(self.default_headers)
        if headers:
            merged.update(headers)
        return merged

    def _get(self, url, params=None, headers=None, cookies=None):
        """Issue a GET through the session and return its response.

        :param url: target URL.
        :param params: query-string mapping; defaults to an empty dict.
        :param headers: extra headers layered over ``default_headers``.
        :param cookies: cookies mapping; defaults to an empty dict.
        """
        if params is None:
            params = {}
        if cookies is None:
            cookies = {}
        return self.r.get(
            url,
            params=params,
            headers=self._build_headers(headers),
            cookies=cookies,
        )

    def _post(self, url, params=None, data=None, headers=None):
        """Issue a POST through the session and return its response.

        :param url: target URL.
        :param params: query-string mapping (previously accepted but
            never sent; it is now forwarded to the session).
        :param data: request body.
        :param headers: extra headers layered over ``default_headers``.
        """
        return self.r.post(
            url,
            params=params,
            data=data,
            headers=self._build_headers(headers),
        )

    def save_data_to_db(self):
        """Hook: persist whatever ``parse_response_data`` produced."""
        pass

    def crawl_now(self):
        """Run one crawl: request, parse the response, then save."""
        response = self.call_request()
        self.parse_response_data(response)
        self.save_data_to_db()

    def call_request(self):
        """Hook: perform the HTTP request and return the response."""
        pass

    def parse_response_data(self, response):
        """Hook: extract data from *response* (the ``call_request`` result)."""
        pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import requests | |
from allauth.socialaccount.models import SocialToken | |
from core.crawlers.base_crawler import BaseCrawler | |
from mmo_autobot.settings import PINTEREST_APP_ID, PINTEREST_APP_SECRET | |
class PinterestCrawler(BaseCrawler):
    """Collect original-size picture URLs from a Pinterest keyword search.

    ``call_request`` queries ``api_url`` with ``self.params``,
    ``parse_response_data`` stores the picture URLs in ``self.parsed_data``
    and ``save_data_to_db`` writes them as ``PictureContent`` rows.
    """

    api_url = 'https://www.pinterest.com/resource/SearchResource/get/'

    # Shared defaults plus the XHR markers; built as a new dict so the
    # parent's ``default_headers`` is left untouched.
    default_headers = dict(BaseCrawler.default_headers)
    default_headers.update({
        'X-Requested-With': 'XMLHttpRequest',
        'X-Pinterest-AppState': 'active',
    })

    # Bookmark token sent when the caller does not supply one.
    DEFAULT_BOOKMARK = 'Y2JVSG81V2sxcmNHRlpWM1J5VFVad1YxWlVSbGhXTVZwSlZGWlZNVlV3TVZkalJFSlhUVzVTVkZWWGN6RldNa3BKVW14S1YxSnNjR2hYVm1ONFpXc3hWMVZ1U2xaaE0wSnpWVzAxUTJWR1pIRlVibVJXVW10d1NGa3dhRWRXVjBWNFUyeFNXbFpGV2pOV01GcExWMWRPUms5V1pGTmhNMEl5Vm1wSmQyVkdUblJXYkdScVVsWmFWMWxzYUVOaFJteFlaRVphYkdKSFVucFhhMXAzVkRGS2RHVkVRbGRXZWtJMFZrZDRXbVF4V2xWUmJGWnBWa1ZhV1ZkV1ZtRmtNVTVIVm14c1lWSlViRlJWYWtwUFRrWmFTR1ZHVGxWTmEzQlhWREZhVjFWc1pFaFVWR3hRWWtaYVNsbHVjRk5pUmtsNFkwVmFWazFxUm5wV1IzaEtaVVprZFZGc1ZtbFNNVXBOVjFaV1ZrMVdaRWRVYmxKT1ZqQmFXRlZ0ZEhkTlJscEZVbXhPYW1GNlZsZFVNVlpYVmtaa1NWRnNSbGRoTVhCSFZGWmFVMVpzY0VkVGJYaFRWa1phU2xaVVNYZGxSbEp6VjJ0YVYyRnNXbGxaYTFwTFVURndXR042VmxSU2EzQXdXVlZWTVdKSFJYZGpTR2hYVFc1U1ZGVnFTa2RXTWs1SFZteGFWMUpyY0ZKV1YzUnJWVEpPYzFWdVVtcFNWWEJ6V1Zod1IyVkdWbk5WYTA1WVlYcEdlVlJWVWtkV1YwWnlZMFpDV21KR1ZqUmFSVnBoVmxVeFJVMVVhR0ZoYTJ0M1ZGVmtVbVZHY0VoU2JURk9aVzF6ZDFkWE1WWk5SVFZGV2tkNFlXRnNSVEZVVmxKYVpWWndWV0pIYUdGaVZtdDRWMnhTYjJGc2NIRmhSM1JhWld4V00xUlZVbk5pUlRWeFZGUk9ZVlpHUlhsVU1HUkxZV3N4TmxGVVFrNVNSVlV3Vkd0U1QySkZNWEZYYldoYVZWUXdPV1pIVW1wWmVrbDZUVlJWTkU0eVVtMVpiVkUwVGpKWmVFOVVXbWhPYlVsNlRucGpNRTFIU1RKTlZFRTBXWHBDYUU5VVFYaFphbFV4V1hwV2JGa3lSbXROUjBsNlQxUmplRTVFVG14T2VrSnBUV3BhYkZwWFVUMD06VUhvNVQySXlOV3htUkU1clQxUk5NRnBVUVRKUFJHczFUVEpSZUU0eVZUVmFSRVV6VDBkT2JGbFVTVFJaVjFwdFRsUm5OVTU2U1hoYWFscG9UbnBzYkU1RVFYaGFWRUY1V2tkWk1rNHlSVFJOZW1ocVdWUmFhMDlYVVhoYVJGazl8ZmQyYWVhMzUwMjEyNzUzMTVhZTdmNDIxNzJkZjU0NDk0N2IxNjZmNTViOTkxOTQ0N2FjYTczZmE3OGJlMjliZg=='

    def __init__(self, keyword, user_id=None, bookmarks=None):
        """Prepare the search request for *keyword*.

        :param keyword: search term; also stored as ``self.keyword``.
        :param user_id: id written to each saved ``PictureContent`` row.
            Optional so existing ``PinterestCrawler(keyword)`` calls keep
            working.
        :param bookmarks: iterable of bookmark tokens to send; defaults to
            ``[DEFAULT_BOOKMARK]``.
        """
        # The parent __init__ is deliberately not called: its signature
        # requires e-mail/password, which this crawler never uses.
        self.r = requests.Session()
        self.keyword = keyword
        # save_data_to_db reads this; it was never assigned before.
        self.user_id = user_id
        # Lets save_data_to_db run safely before anything has been parsed.
        self.parsed_data = []

        if bookmarks is None:
            bookmarks = [self.DEFAULT_BOOKMARK]
        search_options = {
            'bookmarks': list(bookmarks),
            'filters': "",
            'query': "%s" % keyword,
            'scope': "pins",
        }
        self.params = {
            'source_url': '/search/pins/?q=%s' % keyword,
            'rs': 'typed',
            # The endpoint takes the search options as one JSON-encoded
            # query parameter.
            'data': json.dumps({
                'options': search_options,
                "context": {},
            }),
        }

    def parse_response_data(self, response):
        """Store the ``orig`` image URL of every returned pin in ``self.parsed_data``.

        :param response: object whose ``json()`` returns a mapping with a
            ``resource_response`` entry.
        :raises Exception: when ``resource_response`` carries an ``error``.
        """
        resp = response.json()['resource_response']
        error = resp.get('error')
        if error:
            # str() both parts: the error payload is not guaranteed to be
            # a string, and str + non-str would raise TypeError instead.
            raise Exception(str(response) + str(error))

        urls = []
        # ``data`` may be present but null; entries may lack images/orig.
        for pic in resp.get('data') or []:
            original = (pic.get('images') or {}).get('orig') or {}
            url = original.get('url')
            if url:
                urls.append(url)
        self.parsed_data = urls

    def save_data_to_db(self):
        """Create a ``PictureContent`` row for each parsed URL not yet stored."""
        # Imported lazily, as in the original, so the module can be loaded
        # without the fbpage app being importable.
        from fbpage.models import PictureContent, Keyword

        # The lookup kwargs are also used for creation, so no separate
        # ``defaults`` are needed.
        keyword_model, _ = Keyword.objects.get_or_create(name=self.keyword)
        for pic_url in self.parsed_data:
            PictureContent.objects.get_or_create(
                user_id=self.user_id,
                picture_url=pic_url,
                keyword=keyword_model,
            )

    def call_request(self):
        """GET ``api_url`` with the prepared search params and return the response."""
        return self._get(self.api_url, params=self.params)
''' Sample Pinterest request params
:bookmarks : a string you can get by capturing the request to "/resource/SearchResourceBase/get/"
in Chrome Developer Tools (find it in the XHR section when you enter a search on the web). The one I use
is hard-coded into the request because it works :v . (TESTED) 26 Jan 2018
{"options":{"bookmarks":["Y2JVSG81V2sxcmNHRlpWM1J5VFVad1YxWlVSbGhXTVZwSlZGWlZNVlV3TVZkalJFSlhUVzVTVkZWWGN6RldNa3BKVW14S1YxSnNjR2hYVm1ONFpXc3hWMVZ1U2xaaE0wSnpWVzAxUTJWR1pIRlVibVJXVW10d1NGa3dhRWRXVjBWNFUyeFNXbFpGV2pOV01GcExWMWRPUms5V1pGTmhNMEl5Vm1wSmQyVkdUblJXYkdScVVsWmFWMWxzYUVOaFJteFlaRVphYkdKSFVucFhhMXAzVkRGS2RHVkVRbGRXZWtJMFZrZDRXbVF4V2xWUmJGWnBWa1ZhV1ZkV1ZtRmtNVTVIVm14c1lWSlViRlJWYWtwUFRrWmFTR1ZHVGxWTmEzQlhWREZhVjFWc1pFaFVWR3hRWWtaYVNsbHVjRk5pUmtsNFkwVmFWazFxUm5wV1IzaEtaVVprZFZGc1ZtbFNNVXBOVjFaV1ZrMVdaRWRVYmxKT1ZqQmFXRlZ0ZEhkTlJscEZVbXhPYW1GNlZsZFVNVlpYVmtaa1NWRnNSbGRoTVhCSFZGWmFVMVpzY0VkVGJYaFRWa1phU2xaVVNYZGxSbEp6VjJ0YVYyRnNXbGxaYTFwTFVURndXR042VmxSU2EzQXdXVlZWTVdKSFJYZGpTR2hYVFc1U1ZGVnFTa2RXTWs1SFZteGFWMUpyY0ZKV1YzUnJWVEpPYzFWdVVtcFNWWEJ6V1Zod1IyVkdWbk5WYTA1WVlYcEdlVlJWVWtkV1YwWnlZMFpDV21KR1ZqUmFSVnBoVmxVeFJVMVVhR0ZoYTJ0M1ZGVmtVbVZHY0VoU2JURk9aVzF6ZDFkWE1WWk5SVFZGV2tkNFlXRnNSVEZVVmxKYVpWWndWV0pIYUdGaVZtdDRWMnhTYjJGc2NIRmhSM1JhWld4V00xUlZVbk5pUlRWeFZGUk9ZVlpHUlhsVU1HUkxZV3N4TmxGVVFrNVNSVlV3Vkd0U1QySkZNWEZYYldoYVZWUXdPV1pIVW1wWmVrbDZUVlJWTkU0eVVtMVpiVkUwVGpKWmVFOVVXbWhPYlVsNlRucGpNRTFIU1RKTlZFRTBXWHBDYUU5VVFYaFphbFV4V1hwV2JGa3lSbXROUjBsNlQxUmplRTVFVG14T2VrSnBUV3BhYkZwWFVUMD06VUhvNVQySXlOV3htUkU1clQxUk5NRnBVUVRKUFJHczFUVEpSZUU0eVZUVmFSRVV6VDBkT2JGbFVTVFJaVjFwdFRsUm5OVTU2U1hoYWFscG9UbnBzYkU1RVFYaGFWRUY1V2tkWk1rNHlSVFJOZW1ocVdWUmFhMDlYVVhoYVJGazl8ZmQyYWVhMzUwMjEyNzUzMTVhZTdmNDIxNzJkZjU0NDk0N2IxNjZmNTViOTkxOTQ0N2FjYTczZmE3OGJlMjliZg=="],"filters":"","query":"harry potter","scope":"pins"},"context":{}} | |
''' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"request_identifier":"874307195690", | |
"resource_data_cache":[...], | |
"resource":{...}, | |
"client_context":{...}, | |
"resource_response":{ | |
"data":[ | |
{ | |
"domain":"Uploaded by user", | |
"done_by_me":false, | |
"requires_advertiser_attribution":false, | |
"videos":null, | |
"tracking_params":"CwABAAAADDg3NDMwNzE5NTY5MAA", | |
"aggregated_pin_data":{ | |
"did_it_data":{ | |
"recommend_scores":[ | |
{ | |
"count":0, | |
"score":1 | |
}, | |
{ | |
"count":0, | |
"score":0.5 | |
}, | |
{ | |
"count":0, | |
"score":0 | |
} | |
], | |
"rating":-1, | |
"user_count":3, | |
"tags":[ | |
], | |
"images_count":0, | |
"recommended_count":3, | |
"details_count":3, | |
"type":"aggregateddiditdata" | |
}, | |
"id":"4793619930918430080", | |
"aggregated_stats":{ | |
"saves":38841, | |
"done":3 | |
} | |
}, | |
"image_signature":"c839d59c4cf008662871ed797ee84357", | |
"like_count":0, | |
"images":{ | |
"736x":{ | |
"url":"https://i.pinimg.com/736x/c8/39/d5/c839d59c4cf008662871ed797ee84357--lily-potter-harry-potter-.jpg", | |
"width":347, | |
"height":498 | |
}, | |
"474x":{ | |
"url":"https://i.pinimg.com/474x/c8/39/d5/c839d59c4cf008662871ed797ee84357--lily-potter-harry-potter-.jpg", | |
"width":347, | |
"height":498 | |
}, | |
"orig":{ | |
"url":"https://s-media-cache-ak0.pinimg.com/originals/c8/39/d5/c839d59c4cf008662871ed797ee84357.jpg", | |
"width":347, | |
"height":498 | |
}, | |
"136x136":{ | |
"url":"https://i.pinimg.com/136x136/c8/39/d5/c839d59c4cf008662871ed797ee84357--lily-potter-harry-potter-.jpg", | |
"width":136, | |
"height":136 | |
}, | |
"236x":{ | |
"url":"https://i.pinimg.com/236x/c8/39/d5/c839d59c4cf008662871ed797ee84357--lily-potter-harry-potter-.jpg", | |
"width":236, | |
"height":338 | |
} | |
}, | |
"id":"324259241902931702", | |
"price_currency":"USD", | |
"is_promoted":false, | |
"description_html":"C", | |
"privacy":"public", | |
"grid_description":"C", | |
"comments":{ | |
"bookmark":null, | |
"data":[ | |
], | |
"uri":"/v3/pins/324259241902931702/comments/" | |
}, | |
"access":[ | |
], | |
"comment_count":0, | |
"board":{ | |
"is_collaborative":false, | |
"layout":"default", | |
"name":"Creative", | |
"privacy":"public", | |
"url":"/phamthaominh197/creative/", | |
"owner":{ | |
"id":"324259379329222479" | |
}, | |
"followed_by_me":false, | |
"type":"board", | |
"id":"324259310610038895", | |
"image_thumbnail_url":"https://s-media-cache-ak0.pinimg.com/upload/324259310610038895_board_thumbnail_2017-12-23-05-56-05_51593_60.jpg" | |
}, | |
"type":"pin", | |
"method":"uploaded", | |
"attribution":null, | |
"description":"C", | |
"price_value":0.0, | |
"additional_hide_reasons":[ | |
], | |
"native_creator":null, | |
"is_playable":false, | |
"debug_info_html":null, | |
"ad_match_reason":0, | |
"link":null, | |
"has_required_attribution_provider":false, | |
"view_tags":[ | |
], | |
"is_repin":true, | |
"pin360":null, | |
"liked_by_me":false, | |
"rich_summary":null, | |
"is_uploaded":true, | |
"pinner":{ | |
"username":"phamthaominh197", | |
"explicitly_followed_by_me":false, | |
"image_xlarge_url":"https://i.pinimg.com/280x280_RS/97/2b/c4/972bc474022f188d4684f22f1032f127.jpg", | |
"full_name":"Minh Pham", | |
"image_small_url":"https://i.pinimg.com/30x30_RS/97/2b/c4/972bc474022f188d4684f22f1032f127.jpg", | |
"type":"user", | |
"id":"324259379329222479", | |
"image_large_url":"https://i.pinimg.com/140x140_RS/97/2b/c4/972bc474022f188d4684f22f1032f127.jpg" | |
}, | |
"repin_count":0, | |
"created_at":"Tue, 31 May 2016 07:40:52 +0000", | |
"is_native":false, | |
"promoter":null, | |
"promoted_is_removable":false, | |
"buyable_product":null, | |
"dominant_color":"#232b2c", | |
"title":"", | |
"embed":null, | |
"is_quick_promotable":false, | |
"is_video":false, | |
"is_downstream_promotion":false | |
}, | |
... | |
] | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment