A simple crawler based on requests and pyquery.
# -*- coding: utf-8 -*-
'''
1. Construct the item URL from a num_iid, e.g. http://a.m.tmall.com/i15110720150.htm, where 15110720150 is the num_iid.
2. Fetch the HTML text.
3. Parse the image URLs and insert the num_iid and image URLs into sqlite.
'''
import requests
from pyquery import PyQuery as pq
import threadpool  # appears to be a local helper module (see main()); its API does not match the PyPI 'threadpool' package
def getNumiids():
    '''
    Crawl the Tmall search listing for num_iids and write them to num_iids.txt.
    '''
    num_iid_list = []
    # cat=50025145 is the dress category
    url = r'http://list.tmall.com/search_product.htm?type=pc&totalPage=100&cat=50025145&style=l'
    try:
        page_sum = 100
        num_per_page = 90
        for page_num in range(page_sum):
            print page_num
            detail_url = url + '&jumpto=' + str(page_num + 1)
            r = requests.get(detail_url, timeout=20)
            if r.status_code == 200:
                # parse the page once, then read the data-id of each .product node
                products = pq(r.content)('body').find('div').filter('.product')
                for i in range(num_per_page):
                    num_iid = products.eq(i).attr('data-id')
                    if num_iid is not None:
                        num_iid_list.append(num_iid)
    except Exception as e:
        print e
    # use a set here if the ids need to be de-duplicated
    # num_iid_list = list(set(num_iid_list))
    with open('num_iids.txt', 'w') as f:
        for num_iid in num_iid_list:
            f.write(str(num_iid) + '\n')
def getImgUrls(num_iid):
    '''
    Construct the item URL and return its image URLs.
    '''
    url = r'http://a.m.tmall.com/i' + str(num_iid) + '.htm'
    print url
    img_urls = []
    try:
        r = requests.get(url, timeout=10)
        if r.status_code == 200:
            # the first two <img> tags inside the first <div> of the .bd block
            imgs = pq(r.content)('div').filter('.bd').find('div').eq(0).find('img')
            for i in range(2):
                img_url = imgs.eq(i).attr.src
                if img_url is not None:
                    # keep the URL only up to the '.jpg' extension, dropping any trailing suffix
                    img_url = img_url[:img_url.find('jpg') + 3]
                    print img_url
                    img_urls.append(img_url)
            return img_urls
        else:
            print 'status_code != 200', r.status_code
            return []
    except Exception as e:
        print e
        return []
def getCidNumiids():
    '''
    Read the num_iids back from num_iids.txt, skipping blank lines.
    '''
    results = []
    with open('num_iids.txt', 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                results.append(line)
    return results
def main():
    getNumiids()
    # threadpool is assumed to be a local helper module; its workers consume
    # the image URLs that are fed into the queue below
    thread_pool = threadpool.ThreadPool(threadpool.job)
    id_list = getCidNumiids()
    for num_id in id_list:
        img_url_list = getImgUrls(num_id)
        # feed the queue with the image urls
        thread_pool.feed_queue(img_url_list)
    # wait until the queue has been processed
    thread_pool.wait_for_queue()


if __name__ == '__main__':
    main()
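
The threadpool module used in main() (ThreadPool(threadpool.job), feed_queue and wait_for_queue) does not match the API of the PyPI threadpool package and is not included in this gist, so it is presumably a small local helper. A minimal sketch of a compatible helper, assuming the worker job simply downloads each queued image URL (the real job function is not shown), might look like this:

# threadpool.py - hypothetical helper assumed by main() above
import threading
import Queue
import requests

def job(img_url):
    # assumed worker task: download one image URL into the current directory
    try:
        r = requests.get(img_url, timeout=10)
        if r.status_code == 200:
            with open(img_url.split('/')[-1], 'wb') as f:
                f.write(r.content)
    except Exception as e:
        print e

class ThreadPool(object):
    def __init__(self, worker_func, num_workers=8):
        self.queue = Queue.Queue()
        self.worker_func = worker_func
        for _ in range(num_workers):
            t = threading.Thread(target=self._worker)
            t.daemon = True
            t.start()

    def _worker(self):
        # pull items off the queue forever; daemon threads die with the process
        while True:
            item = self.queue.get()
            try:
                self.worker_func(item)
            finally:
                self.queue.task_done()

    def feed_queue(self, items):
        for item in items:
            self.queue.put(item)

    def wait_for_queue(self):
        # block until every queued item has been processed
        self.queue.join()

With a helper like this, ThreadPool(threadpool.job) starts the worker threads, feed_queue enqueues the image URLs returned by getImgUrls, and wait_for_queue blocks until every queued download has been attempted.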
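Step 3 of the module docstring says the num_iid and image URLs are inserted into sqlite, but no sqlite code appears in this file. A minimal sketch of that step, using the standard sqlite3 module with an assumed item_images table (the table and column names are guesses, not from the original), could be:

import sqlite3

def save_img_urls(num_iid, img_urls, db_path='tmall.db'):
    # store one (num_iid, img_url) row per image; the schema is an assumption
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('CREATE TABLE IF NOT EXISTS item_images '
                     '(num_iid TEXT, img_url TEXT)')
        conn.executemany('INSERT INTO item_images (num_iid, img_url) VALUES (?, ?)',
                         [(str(num_iid), u) for u in img_urls])
        conn.commit()
    finally:
        conn.close()

In main(), this could be called right after getImgUrls(num_id) returns, e.g. save_img_urls(num_id, img_url_list).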