Created
April 8, 2018 05:42
-
-
Save mugbya/42e52834a1c5b1600b2e72cad4f27f80 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
import json
import logging
from uuid import uuid1

import scrapy
logger = logging.getLogger(__name__) | |
class TiebaSpider(scrapy.Spider):
    """Spider that extracts profile fields from a Facebook timeline page.

    For each response it builds one dict (random hex ``id`` plus whatever
    profile fields the XPath selectors match) and prints it to stdout as a
    JSON document.
    """

    name = "facebook"
    allowed_domains = ['www.facebook.com']
    start_urls = ['https://www.facebook.com/diana.liu.31521']
    # start_urls = ['https://www.facebook.com/yang.liu.96']

    # (result key, XPath) pairs where only the first match is kept.
    _FIRST_MATCH_FIELDS = (
        ('username', '//*[@id="fb-timeline-cover-name"]/a/text()'),
        # Avatar image.
        ('profile_pic_url_hd',
         '//*[@id="fbTimelineHeadline"]/div[3]/div/div/div/img/@src'),
        # Cover (background) image.
        ('profile_pic_url_bg', '//*[@id="fbCoverImageContainer"]/img[1]/@src'),
    )

    # (result key, XPath) pairs where all matches are joined with spaces.
    _JOINED_FIELDS = (
        # Personal bio.
        ('person_desc', '//*[@id="pagelet_bio"]/div/ul/li/div/div/span/text()'),
        # Favourite quotes.
        ('quotes', '//*[@id="pagelet_quotes"]/div/ul/li/div/div/span/text()'),
    )

    @staticmethod
    def _first(values, default=""):
        """Return the first element of *values*, or *default* if it is empty."""
        return values[0] if values else default

    def parse(self, response):
        """Extract the profile record from *response* and print it as JSON.

        Scalar keys (``username``, ``profile_pic_url_hd``,
        ``profile_pic_url_bg``, ``person_desc``, ``quotes``) are only present
        when their selector matched. The list keys (``work_list``,
        ``hometownlist``, ``skill_list``, ``education_list``) are always
        present and may be empty.
        """
        res = {'id': uuid1().hex}

        for key, query in self._FIRST_MATCH_FIELDS:
            matches = response.xpath(query).extract()
            if matches:
                res[key] = matches[0]

        for key, query in self._JOINED_FIELDS:
            matches = response.xpath(query).extract()
            if matches:
                res[key] = ' '.join(matches)

        # The "education & work" pagelet holds several sections; each one is
        # identified by the exact text of its heading span.
        work_nodes = []
        skills_node = None
        education_nodes = []
        for section in response.xpath('//*[@id="pagelet_eduwork"]/div/div'):
            headings = section.xpath('div/span/text()').extract()
            if 'Work' in headings:
                work_nodes = section.xpath('ul/li')
            if 'Professional Skills' in headings:
                skills_node = section.xpath('ul/li/div')
            if 'Education' in headings:
                education_nodes = section.xpath('ul/li')

        work_list = []
        for node in work_nodes:
            details = node.css('div::text').extract()
            work_list.append({
                # An entry without a link or text must not abort the parse,
                # so missing pieces fall back to an empty string.
                'company': self._first(node.css('a::text').extract()),
                'title': self._first(details),
                'other_info': details[1:],
            })

        skill_list = []
        if skills_node:
            skill_list = skills_node.xpath('a/text()').extract()

        education_list = [
            {
                "school": self._first(node.css('a::text').extract()),
                "other_desc": node.css('div::text').extract(),
            }
            for node in education_nodes
        ]

        hometownlist = [
            {
                'addr': self._first(node.css('a::text').extract()),
                'desc': self._first(node.css('div::text').extract()),
            }
            for node in response.xpath('//*[@id="pagelet_hometown"]/div/div/ul/li')
        ]

        res.update({
            'work_list': work_list,
            'hometownlist': hometownlist,
            'skill_list': skill_list,
            'education_list': education_list,
        })
        # Real JSON serialisation: quotes/apostrophes inside scraped text are
        # escaped correctly, and non-ASCII text stays human-readable.
        print(json.dumps(res, ensure_ascii=False))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment