-
-
Save 123789987/1758b69687e94acf81af88859de6f59e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import json
import logging
from uuid import uuid1

import scrapy
logger = logging.getLogger(__name__) | |
class TiebaSpider(scrapy.Spider):
    """Scrape profile fields from a Facebook timeline page.

    ``parse`` gathers the user name, avatar and cover image URLs, bio,
    quotes, work history, skills, education and hometown entries into a
    single dict and prints that dict to stdout as one JSON document.
    """

    name = "facebook"
    allowed_domains = ['www.facebook.com']
    start_urls = ['https://www.facebook.com/diana.liu.31521']
    # start_urls = ['https://www.facebook.com/yang.liu.96']

    # (result key, XPath) pairs whose first matched value is stored as-is.
    _FIRST_VALUE_FIELDS = (
        ('username', '//*[@id="fb-timeline-cover-name"]/a/text()'),
        # Avatar image.
        ('profile_pic_url_hd',
         '//*[@id="fbTimelineHeadline"]/div[3]/div/div/div/img/@src'),
        # Cover (background) image.
        ('profile_pic_url_bg',
         '//*[@id="fbCoverImageContainer"]/img[1]/@src'),
    )

    # (result key, XPath) pairs whose matched text nodes are joined by spaces.
    _JOINED_TEXT_FIELDS = (
        # Personal bio.
        ('person_desc',
         '//*[@id="pagelet_bio"]/div/ul/li/div/div/span/text()'),
        # Favourite quotes.
        ('quotes',
         '//*[@id="pagelet_quotes"]/div/ul/li/div/div/span/text()'),
    )

    @staticmethod
    def _first(values, default=""):
        """Return the first element of *values*, or *default* if it is empty."""
        return values[0] if values else default

    def parse(self, response):
        """Extract the profile data from *response* and print it as JSON.

        Args:
            response: The Scrapy response for a profile page; only its
                ``xpath`` method is used directly.

        Returns:
            None. The collected dict is written to stdout.

        Optional scalar fields (``username``, ``profile_pic_url_hd``,
        ``profile_pic_url_bg``, ``person_desc``, ``quotes``) are only present
        in the result when the page contains them. The list fields
        (``work_list``, ``hometownlist``, ``skill_list``,
        ``education_list``) are always present and may be empty.
        """
        res = {'id': uuid1().hex}

        for key, query in self._FIRST_VALUE_FIELDS:
            values = response.xpath(query).extract()
            if values:
                res[key] = values[0]

        for key, query in self._JOINED_TEXT_FIELDS:
            parts = response.xpath(query).extract()
            if parts:
                res[key] = ' '.join(parts)

        # The education/work pagelet holds several sections; each one is
        # identified by the exact text of its heading span.
        work_nodes = []
        skills_node = None
        education_nodes = []
        for section in response.xpath('//*[@id="pagelet_eduwork"]/div/div'):
            headings = section.xpath('div/span/text()').extract()
            if 'Work' in headings:
                work_nodes = section.xpath('ul/li')
            if 'Professional Skills' in headings:
                skills_node = section.xpath('ul/li/div')
            if 'Education' in headings:
                education_nodes = section.xpath('ul/li')

        work_list = []
        for node in work_nodes:
            details = node.css('div::text').extract()
            work_list.append({
                # An entry without a link must not abort the whole page,
                # so fall back to an empty company name.
                'company': self._first(node.css('a::text').extract()),
                # The first text fragment is the job title; the remaining
                # fragments are kept as free-form extra information.
                'title': self._first(details),
                'other_info': details[1:],
            })

        skill_list = []
        if skills_node:
            skill_list = skills_node.xpath('a/text()').extract()

        education_list = [
            {
                "school": self._first(node.css('a::text').extract()),
                "other_desc": node.css('div::text').extract(),
            }
            for node in education_nodes
        ]

        hometownlist = [
            {
                'addr': self._first(node.css('a::text').extract()),
                'desc': self._first(node.css('div::text').extract()),
            }
            for node in response.xpath(
                '//*[@id="pagelet_hometown"]/div/div/ul/li')
        ]

        res.update({
            'work_list': work_list,
            'hometownlist': hometownlist,
            'skill_list': skill_list,
            'education_list': education_list,
        })

        # Serialise with json so quotes/apostrophes inside values are
        # escaped correctly; ensure_ascii=False keeps non-ASCII text readable.
        print(json.dumps(res, ensure_ascii=False))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment