Last active
February 26, 2019 09:49
-
-
Save nix010/b8d658c00761098aecd8d1a09ea5e9d3 to your computer and use it in GitHub Desktop.
Crawl pictures from Pinterest by searching for a keyword | 26 Jan, 2018 (TESTED)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup as BS | |
import requests | |
class BaseCrawler(object):
    """Minimal base class for HTTP crawlers built on a ``requests`` session.

    Subclasses set ``api_url`` and override the hook methods
    ``call_request``, ``parse_response_data`` and ``save_data_to_db``;
    ``crawl_now`` runs those three hooks in that order.
    """

    # Endpoint the crawler talks to; unset in the base class.
    api_url = None

    # Headers sent with every request. Per-call headers are merged into a
    # copy of this dict, never into the dict itself.
    default_headers = {
        'Accept-Language': 'en-US,en;q=0.9,vi;q=0.8',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/json',
        'Accept': '*/*',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/62.0.3202.94 Chrome/62.0.3202.94 Safari/537.36'
    }

    def __init__(self, email, password, user_id, **kwargs):
        """Create the HTTP session used by ``_get`` and ``_post``.

        ``email``, ``password``, ``user_id`` and ``kwargs`` are accepted for
        interface compatibility; the base class itself does not use them.
        """
        self.r = requests.Session()

    def _build_headers(self, headers=None):
        """Return a new dict: ``default_headers`` overlaid with ``headers``."""
        # Copy first so one call's extra headers cannot leak into later
        # requests (or other instances) through the shared class attribute.
        merged = dict(self.default_headers)
        if headers:
            merged.update(headers)
        return merged

    def _get(self, url, params=None, headers=None, cookies=None):
        """Send a GET request through the session and return its response.

        :param url: target URL.
        :param params: query-string parameters (defaults to an empty dict).
        :param headers: extra headers overriding the defaults for this call.
        :param cookies: cookies for this call (defaults to an empty dict).
        """
        if params is None:
            params = {}
        if cookies is None:
            cookies = {}
        return self.r.get(
            url,
            params=params,
            headers=self._build_headers(headers),
            cookies=cookies,
        )

    def _post(self, url, params=None, data=None, headers=None):
        """Send a POST request through the session and return its response.

        :param url: target URL.
        :param params: query-string parameters, forwarded to the session.
        :param data: request body.
        :param headers: extra headers overriding the defaults for this call.
        """
        return self.r.post(
            url,
            params=params,
            data=data,
            headers=self._build_headers(headers),
        )

    def save_data_to_db(self):
        """Hook: persist the parsed data. No-op in the base class."""
        pass

    def crawl_now(self):
        """Run one crawl: request, parse the response, then save."""
        response = self.call_request()
        self.parse_response_data(response)
        self.save_data_to_db()

    def call_request(self):
        """Hook: perform the request and return the response. No-op here."""
        pass

    def parse_response_data(self, response):
        """Hook: extract data from ``response``. No-op in the base class."""
        pass
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from core.crawlers.base_crawler import BaseCrawler # Just some helpers to call API | |
class PinterestCrawler(BaseCrawler):
    """Collect original-size image URLs from Pinterest search for a keyword.

    After ``crawl_now()`` (or ``parse_response_data``) the URLs are available
    as a list in ``self.parsed_data``.
    """

    api_url = 'https://www.pinterest.com/resource/SearchResource/get/'
    default_headers = dict({
        'X-Requested-With': 'XMLHttpRequest',
        'X-Pinterest-AppState': 'active',
    }, **BaseCrawler.default_headers)

    # Bookmark value sent when the caller does not supply one. It was
    # captured from a browser search request and is kept verbatim.
    DEFAULT_BOOKMARKS = (
        'Y2JVSG81V2sxcmNHRlpWM1J5VFVad1YxWlVSbGhXTVZwSlZGWlZNVlV3TVZkalJFSlhUVzVTVkZWWGN6RldNa3BKVW14S1YxSnNjR2hYVm1ONFpXc3hWMVZ1U2xaaE0wSnpWVzAxUTJWR1pIRlVibVJXVW10d1NGa3dhRWRXVjBWNFUyeFNXbFpGV2pOV01GcExWMWRPUms5V1pGTmhNMEl5Vm1wSmQyVkdUblJXYkdScVVsWmFWMWxzYUVOaFJteFlaRVphYkdKSFVucFhhMXAzVkRGS2RHVkVRbGRXZWtJMFZrZDRXbVF4V2xWUmJGWnBWa1ZhV1ZkV1ZtRmtNVTVIVm14c1lWSlViRlJWYWtwUFRrWmFTR1ZHVGxWTmEzQlhWREZhVjFWc1pFaFVWR3hRWWtaYVNsbHVjRk5pUmtsNFkwVmFWazFxUm5wV1IzaEtaVVprZFZGc1ZtbFNNVXBOVjFaV1ZrMVdaRWRVYmxKT1ZqQmFXRlZ0ZEhkTlJscEZVbXhPYW1GNlZsZFVNVlpYVmtaa1NWRnNSbGRoTVhCSFZGWmFVMVpzY0VkVGJYaFRWa1phU2xaVVNYZGxSbEp6VjJ0YVYyRnNXbGxaYTFwTFVURndXR042VmxSU2EzQXdXVlZWTVdKSFJYZGpTR2hYVFc1U1ZGVnFTa2RXTWs1SFZteGFWMUpyY0ZKV1YzUnJWVEpPYzFWdVVtcFNWWEJ6V1Zod1IyVkdWbk5WYTA1WVlYcEdlVlJWVWtkV1YwWnlZMFpDV21KR1ZqUmFSVnBoVmxVeFJVMVVhR0ZoYTJ0M1ZGVmtVbVZHY0VoU2JURk9aVzF6ZDFkWE1WWk5SVFZGV2tkNFlXRnNSVEZVVmxKYVpWWndWV0pIYUdGaVZtdDRWMnhTYjJGc2NIRmhSM1JhWld4V00xUlZVbk5pUlRWeFZGUk9ZVlpHUlhsVU1HUkxZV3N4TmxGVVFrNVNSVlV3Vkd0U1QySkZNWEZYYldoYVZWUXdPV1pIVW1wWmVrbDZUVlJWTkU0eVVtMVpiVkUwVGpKWmVFOVVXbWhPYlVsNlRucGpNRTFIU1RKTlZFRTBXWHBDYUU5VVFYaFphbFV4V1hwV2JGa3lSbXROUjBsNlQxUmplRTVFVG14T2VrSnBUV3BhYkZwWFVUMD06VUhvNVQySXlOV3htUkU1clQxUk5NRnBVUVRKUFJHczFUVEpSZUU0eVZUVmFSRVV6VDBkT2JGbFVTVFJaVjFwdFRsUm5OVTU2U1hoYWFscG9UbnBzYkU1RVFYaGFWRUY1V2tkWk1rNHlSVFJOZW1ocVdWUmFhMDlYVVhoYVJGazl8ZmQyYWVhMzUwMjEyNzUzMTVhZTdmNDIxNzJkZjU0NDk0N2IxNjZmNTViOTkxOTQ0N2FjYTczZmE3OGJlMjliZg==',
    )

    def __init__(self, keyword, bookmarks=None):
        """Prepare the session and the query parameters for ``keyword``.

        :param keyword: search text, placed in both ``source_url`` and the
            ``query`` option of the request.
        :param bookmarks: optional iterable of bookmark strings for the
            request's ``bookmarks`` option; defaults to ``DEFAULT_BOOKMARKS``.
        """
        # Imported here because this class creates its own session instead of
        # going through BaseCrawler.__init__, and this module's top-level
        # imports do not include requests.
        import requests

        self.r = requests.Session()
        self.keyword = keyword
        if bookmarks is None:
            bookmarks = self.DEFAULT_BOOKMARKS
        self.params = {
            'source_url': '/search/pins/?q=%s' % keyword,
            'rs': 'typed',
            'data': json.dumps({
                'options': {
                    'bookmarks': list(bookmarks),
                    'filters': "",
                    'query': "%s" % keyword,
                    'scope': "pins"
                },
                "context": {}
            })
        }

    def parse_response_data(self, response):
        """Extract original-size image URLs into ``self.parsed_data``.

        :param response: response object whose ``json()`` payload contains a
            ``resource_response`` mapping.
        :raises Exception: if ``resource_response`` carries a truthy
            ``error`` entry; the message combines the response and the error.
        """
        resp = response.json()['resource_response']
        error = resp.get('error')
        if error:
            # %-formatting keeps the original message for string errors and
            # avoids a TypeError when the error payload is not a string.
            raise Exception('%s%s' % (response, error))

        urls = []
        # ``data`` may be absent or null; treat both as "no results".
        for pin in resp.get('data') or []:
            # Skip pins without an original-size image instead of crashing.
            original = (pin.get('images') or {}).get('orig') or {}
            url = original.get('url')
            if url:
                urls.append(url)
        # This is the result after parsing.
        self.parsed_data = urls

    def call_request(self):
        """GET the search endpoint with the prepared params; return the response."""
        return self._get(self.api_url, params=self.params)
''' Sample Pinterest request params
:bookmarks : a string you can get by capturing the request to "/resource/SearchResourceBase/get/"
in Chrome Developer Tools (find it in the XHR section when you enter a search on the web). The one I use
is hard-coded into the request because it works :v . (TESTED) 26 Jan 2018
{"options":{"bookmarks":["Y2JVSG81V2sxcmNHRlpWM1J5VFVad1YxWlVSbGhXTVZwSlZGWlZNVlV3TVZkalJFSlhUVzVTVkZWWGN6RldNa3BKVW14S1YxSnNjR2hYVm1ONFpXc3hWMVZ1U2xaaE0wSnpWVzAxUTJWR1pIRlVibVJXVW10d1NGa3dhRWRXVjBWNFUyeFNXbFpGV2pOV01GcExWMWRPUms5V1pGTmhNMEl5Vm1wSmQyVkdUblJXYkdScVVsWmFWMWxzYUVOaFJteFlaRVphYkdKSFVucFhhMXAzVkRGS2RHVkVRbGRXZWtJMFZrZDRXbVF4V2xWUmJGWnBWa1ZhV1ZkV1ZtRmtNVTVIVm14c1lWSlViRlJWYWtwUFRrWmFTR1ZHVGxWTmEzQlhWREZhVjFWc1pFaFVWR3hRWWtaYVNsbHVjRk5pUmtsNFkwVmFWazFxUm5wV1IzaEtaVVprZFZGc1ZtbFNNVXBOVjFaV1ZrMVdaRWRVYmxKT1ZqQmFXRlZ0ZEhkTlJscEZVbXhPYW1GNlZsZFVNVlpYVmtaa1NWRnNSbGRoTVhCSFZGWmFVMVpzY0VkVGJYaFRWa1phU2xaVVNYZGxSbEp6VjJ0YVYyRnNXbGxaYTFwTFVURndXR042VmxSU2EzQXdXVlZWTVdKSFJYZGpTR2hYVFc1U1ZGVnFTa2RXTWs1SFZteGFWMUpyY0ZKV1YzUnJWVEpPYzFWdVVtcFNWWEJ6V1Zod1IyVkdWbk5WYTA1WVlYcEdlVlJWVWtkV1YwWnlZMFpDV21KR1ZqUmFSVnBoVmxVeFJVMVVhR0ZoYTJ0M1ZGVmtVbVZHY0VoU2JURk9aVzF6ZDFkWE1WWk5SVFZGV2tkNFlXRnNSVEZVVmxKYVpWWndWV0pIYUdGaVZtdDRWMnhTYjJGc2NIRmhSM1JhWld4V00xUlZVbk5pUlRWeFZGUk9ZVlpHUlhsVU1HUkxZV3N4TmxGVVFrNVNSVlV3Vkd0U1QySkZNWEZYYldoYVZWUXdPV1pIVW1wWmVrbDZUVlJWTkU0eVVtMVpiVkUwVGpKWmVFOVVXbWhPYlVsNlRucGpNRTFIU1RKTlZFRTBXWHBDYUU5VVFYaFphbFV4V1hwV2JGa3lSbXROUjBsNlQxUmplRTVFVG14T2VrSnBUV3BhYkZwWFVUMD06VUhvNVQySXlOV3htUkU1clQxUk5NRnBVUVRKUFJHczFUVEpSZUU0eVZUVmFSRVV6VDBkT2JGbFVTVFJaVjFwdFRsUm5OVTU2U1hoYWFscG9UbnBzYkU1RVFYaGFWRUY1V2tkWk1rNHlSVFJOZW1ocVdWUmFhMDlYVVhoYVJGazl8ZmQyYWVhMzUwMjEyNzUzMTVhZTdmNDIxNzJkZjU0NDk0N2IxNjZmNTViOTkxOTQ0N2FjYTczZmE3OGJlMjliZg=="],"filters":"","query":"harry potter","scope":"pins"},"context":{}} | |
''' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"request_identifier":"874307195690", | |
"resource_data_cache":[...], | |
"resource":{...}, | |
"client_context":{...}, | |
"resource_response":{ | |
"data":[ | |
{ | |
"domain":"Uploaded by user", | |
"done_by_me":false, | |
"requires_advertiser_attribution":false, | |
"videos":null, | |
"tracking_params":"CwABAAAADDg3NDMwNzE5NTY5MAA", | |
"aggregated_pin_data":{ | |
"did_it_data":{ | |
"recommend_scores":[ | |
{ | |
"count":0, | |
"score":1 | |
}, | |
{ | |
"count":0, | |
"score":0.5 | |
}, | |
{ | |
"count":0, | |
"score":0 | |
} | |
], | |
"rating":-1, | |
"user_count":3, | |
"tags":[ | |
], | |
"images_count":0, | |
"recommended_count":3, | |
"details_count":3, | |
"type":"aggregateddiditdata" | |
}, | |
"id":"4793619930918430080", | |
"aggregated_stats":{ | |
"saves":38841, | |
"done":3 | |
} | |
}, | |
"image_signature":"c839d59c4cf008662871ed797ee84357", | |
"like_count":0, | |
"images":{ | |
"736x":{ | |
"url":"https://i.pinimg.com/736x/c8/39/d5/c839d59c4cf008662871ed797ee84357--lily-potter-harry-potter-.jpg", | |
"width":347, | |
"height":498 | |
}, | |
"474x":{ | |
"url":"https://i.pinimg.com/474x/c8/39/d5/c839d59c4cf008662871ed797ee84357--lily-potter-harry-potter-.jpg", | |
"width":347, | |
"height":498 | |
}, | |
"orig":{ | |
"url":"https://s-media-cache-ak0.pinimg.com/originals/c8/39/d5/c839d59c4cf008662871ed797ee84357.jpg", | |
"width":347, | |
"height":498 | |
}, | |
"136x136":{ | |
"url":"https://i.pinimg.com/136x136/c8/39/d5/c839d59c4cf008662871ed797ee84357--lily-potter-harry-potter-.jpg", | |
"width":136, | |
"height":136 | |
}, | |
"236x":{ | |
"url":"https://i.pinimg.com/236x/c8/39/d5/c839d59c4cf008662871ed797ee84357--lily-potter-harry-potter-.jpg", | |
"width":236, | |
"height":338 | |
} | |
}, | |
"id":"324259241902931702", | |
"price_currency":"USD", | |
"is_promoted":false, | |
"description_html":"C", | |
"privacy":"public", | |
"grid_description":"C", | |
"comments":{ | |
"bookmark":null, | |
"data":[ | |
], | |
"uri":"/v3/pins/324259241902931702/comments/" | |
}, | |
"access":[ | |
], | |
"comment_count":0, | |
"board":{ | |
"is_collaborative":false, | |
"layout":"default", | |
"name":"Creative", | |
"privacy":"public", | |
"url":"/phamthaominh197/creative/", | |
"owner":{ | |
"id":"324259379329222479" | |
}, | |
"followed_by_me":false, | |
"type":"board", | |
"id":"324259310610038895", | |
"image_thumbnail_url":"https://s-media-cache-ak0.pinimg.com/upload/324259310610038895_board_thumbnail_2017-12-23-05-56-05_51593_60.jpg" | |
}, | |
"type":"pin", | |
"method":"uploaded", | |
"attribution":null, | |
"description":"C", | |
"price_value":0.0, | |
"additional_hide_reasons":[ | |
], | |
"native_creator":null, | |
"is_playable":false, | |
"debug_info_html":null, | |
"ad_match_reason":0, | |
"link":null, | |
"has_required_attribution_provider":false, | |
"view_tags":[ | |
], | |
"is_repin":true, | |
"pin360":null, | |
"liked_by_me":false, | |
"rich_summary":null, | |
"is_uploaded":true, | |
"pinner":{ | |
"username":"phamthaominh197", | |
"explicitly_followed_by_me":false, | |
"image_xlarge_url":"https://i.pinimg.com/280x280_RS/97/2b/c4/972bc474022f188d4684f22f1032f127.jpg", | |
"full_name":"Minh Pham", | |
"image_small_url":"https://i.pinimg.com/30x30_RS/97/2b/c4/972bc474022f188d4684f22f1032f127.jpg", | |
"type":"user", | |
"id":"324259379329222479", | |
"image_large_url":"https://i.pinimg.com/140x140_RS/97/2b/c4/972bc474022f188d4684f22f1032f127.jpg" | |
}, | |
"repin_count":0, | |
"created_at":"Tue, 31 May 2016 07:40:52 +0000", | |
"is_native":false, | |
"promoter":null, | |
"promoted_is_removable":false, | |
"buyable_product":null, | |
"dominant_color":"#232b2c", | |
"title":"", | |
"embed":null, | |
"is_quick_promotable":false, | |
"is_video":false, | |
"is_downstream_promotion":false | |
}, | |
... | |
] | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Can you tell me if it works with django?