mugbya · April 8, 2018 05:42
diff --git a/facebook_crawler.py b/facebook_crawler.py
 # -*- coding: utf-8 -*-
 import scrapy
 from uuid import uuid1
 import logging
 logger = logging.getLogger(__name__)


 class TiebaSpider(scrapy.Spider):
    """Spider that scrapes profile fields from a Facebook timeline page.

    For every response, :meth:`parse` collects the user name, avatar and
    cover image URLs, bio, quotes, work history, skills, education and
    hometown entries into one dict and prints that dict as a JSON line.
    """

    name = "facebook"
    allowed_domains = ['www.facebook.com']
    start_urls = ['https://www.facebook.com/diana.liu.31521']
    # start_urls = ['https://www.facebook.com/yang.liu.96']

    # (output key, XPath) pairs where only the first match is stored.
    _FIRST_MATCH_XPATHS = (
        ('username', '//*[@id="fb-timeline-cover-name"]/a/text()'),
        # Avatar image.
        ('profile_pic_url_hd', '//*[@id="fbTimelineHeadline"]/div[3]/div/div/div/img/@src'),
        # Cover (background) image.
        ('profile_pic_url_bg', '//*[@id="fbCoverImageContainer"]/img[1]/@src'),
    )

    # (output key, XPath) pairs where all matches are joined with a space.
    _JOINED_TEXT_XPATHS = (
        # Personal bio.
        ('person_desc', '//*[@id="pagelet_bio"]/div/ul/li/div/div/span/text()'),
        # Favourite quotes.
        ('quotes', '//*[@id="pagelet_quotes"]/div/ul/li/div/div/span/text()'),
    )

    @staticmethod
    def _first(values, default=""):
        """Return the first element of ``values``, or ``default`` if it is empty."""
        return values[0] if values else default

    def parse(self, response):
        """Extract the profile fields from ``response`` and print them as JSON.

        Scalar fields (``username``, ``profile_pic_url_hd``,
        ``profile_pic_url_bg``, ``person_desc``, ``quotes``) are only added
        when their XPath matches. The list fields (``work_list``,
        ``hometownlist``, ``skill_list``, ``education_list``) are always
        present and default to an empty list. Sub-fields whose selector
        matches nothing are stored as an empty string.

        :param response: the downloaded profile page; must provide
            ``xpath()`` returning selectors that support ``xpath()``,
            ``css()`` and ``extract()``.
        :return: ``None``; the result is written to stdout.
        """
        # Function-scope import so this block does not depend on a
        # module-level ``import json`` being present.
        import json

        res = {'id': str(uuid1()).replace('-', "")}

        for key, query in self._FIRST_MATCH_XPATHS:
            matches = response.xpath(query).extract()
            if matches:
                res[key] = matches[0]

        for key, query in self._JOINED_TEXT_XPATHS:
            matches = response.xpath(query).extract()
            if matches:
                res[key] = ' '.join(matches)

        # The education/work pagelet holds several sections; each one is
        # identified by the heading text found in its ``div/span``.
        work_nodes = []
        skills_node = None
        education_nodes = []
        for section in response.xpath('//*[@id="pagelet_eduwork"]/div/div'):
            sign = section.xpath('div/span/text()').extract()
            if 'Work' in sign:
                work_nodes = section.xpath('ul/li')
            if 'Professional Skills' in sign:
                skills_node = section.xpath('ul/li/div')
            if 'Education' in sign:
                education_nodes = section.xpath('ul/li')

        hometown_nodes = response.xpath('//*[@id="pagelet_hometown"]/div/div/ul/li')

        work_list = []
        for node in work_nodes:
            company = node.css('a::text').extract()
            details = node.css('div::text').extract()
            work_list.append({
                # An entry without a link used to raise IndexError here.
                'company': self._first(company),
                # First text fragment is the title, the rest is free-form.
                'title': self._first(details),
                'other_info': details[1:],
            })

        skill_list = skills_node.xpath('a/text()').extract() if skills_node else []

        education_list = []
        for node in education_nodes:
            school = node.css('a::text').extract()
            education_list.append({
                "school": self._first(school),
                "other_desc": node.css('div::text').extract(),
            })

        hometownlist = []
        for node in hometown_nodes:
            addr = node.css('a::text').extract()
            desc = node.css('div::text').extract()
            hometownlist.append({
                'addr': self._first(addr),
                'desc': self._first(desc),
            })

        res.update({
            'work_list': work_list,
            'hometownlist': hometownlist,
            'skill_list': skill_list,
            'education_list': education_list,
        })

        # json.dumps escapes quotes inside scraped values; swapping ' for "
        # in the dict repr produced invalid JSON for any value with a quote.
        print(json.dumps(res, ensure_ascii=False))
	# -*- coding: utf-8 -*-
	import scrapy
	from uuid import uuid1
	import logging
	logger = logging.getLogger(__name__)


	class TiebaSpider(scrapy.Spider):
	    """Spider that scrapes profile fields from a Facebook timeline page.

	    For every response, :meth:`parse` collects the user name, avatar and
	    cover image URLs, bio, quotes, work history, skills, education and
	    hometown entries into one dict and prints that dict as a JSON line.
	    """

	    name = "facebook"
	    allowed_domains = ['www.facebook.com']
	    start_urls = ['https://www.facebook.com/diana.liu.31521']
	    # start_urls = ['https://www.facebook.com/yang.liu.96']

	    # (output key, XPath) pairs where only the first match is stored.
	    _FIRST_MATCH_XPATHS = (
	        ('username', '//*[@id="fb-timeline-cover-name"]/a/text()'),
	        # Avatar image.
	        ('profile_pic_url_hd', '//*[@id="fbTimelineHeadline"]/div[3]/div/div/div/img/@src'),
	        # Cover (background) image.
	        ('profile_pic_url_bg', '//*[@id="fbCoverImageContainer"]/img[1]/@src'),
	    )

	    # (output key, XPath) pairs where all matches are joined with a space.
	    _JOINED_TEXT_XPATHS = (
	        # Personal bio.
	        ('person_desc', '//*[@id="pagelet_bio"]/div/ul/li/div/div/span/text()'),
	        # Favourite quotes.
	        ('quotes', '//*[@id="pagelet_quotes"]/div/ul/li/div/div/span/text()'),
	    )

	    @staticmethod
	    def _first(values, default=""):
	        """Return the first element of ``values``, or ``default`` if it is empty."""
	        return values[0] if values else default

	    def parse(self, response):
	        """Extract the profile fields from ``response`` and print them as JSON.

	        Scalar fields (``username``, ``profile_pic_url_hd``,
	        ``profile_pic_url_bg``, ``person_desc``, ``quotes``) are only added
	        when their XPath matches. The list fields (``work_list``,
	        ``hometownlist``, ``skill_list``, ``education_list``) are always
	        present and default to an empty list. Sub-fields whose selector
	        matches nothing are stored as an empty string.

	        :param response: the downloaded profile page; must provide
	            ``xpath()`` returning selectors that support ``xpath()``,
	            ``css()`` and ``extract()``.
	        :return: ``None``; the result is written to stdout.
	        """
	        # Function-scope import so this block does not depend on a
	        # module-level ``import json`` being present.
	        import json

	        res = {'id': str(uuid1()).replace('-', "")}

	        for key, query in self._FIRST_MATCH_XPATHS:
	            matches = response.xpath(query).extract()
	            if matches:
	                res[key] = matches[0]

	        for key, query in self._JOINED_TEXT_XPATHS:
	            matches = response.xpath(query).extract()
	            if matches:
	                res[key] = ' '.join(matches)

	        # The education/work pagelet holds several sections; each one is
	        # identified by the heading text found in its ``div/span``.
	        work_nodes = []
	        skills_node = None
	        education_nodes = []
	        for section in response.xpath('//*[@id="pagelet_eduwork"]/div/div'):
	            sign = section.xpath('div/span/text()').extract()
	            if 'Work' in sign:
	                work_nodes = section.xpath('ul/li')
	            if 'Professional Skills' in sign:
	                skills_node = section.xpath('ul/li/div')
	            if 'Education' in sign:
	                education_nodes = section.xpath('ul/li')

	        hometown_nodes = response.xpath('//*[@id="pagelet_hometown"]/div/div/ul/li')

	        work_list = []
	        for node in work_nodes:
	            company = node.css('a::text').extract()
	            details = node.css('div::text').extract()
	            work_list.append({
	                # An entry without a link used to raise IndexError here.
	                'company': self._first(company),
	                # First text fragment is the title, the rest is free-form.
	                'title': self._first(details),
	                'other_info': details[1:],
	            })

	        skill_list = skills_node.xpath('a/text()').extract() if skills_node else []

	        education_list = []
	        for node in education_nodes:
	            school = node.css('a::text').extract()
	            education_list.append({
	                "school": self._first(school),
	                "other_desc": node.css('div::text').extract(),
	            })

	        hometownlist = []
	        for node in hometown_nodes:
	            addr = node.css('a::text').extract()
	            desc = node.css('div::text').extract()
	            hometownlist.append({
	                'addr': self._first(addr),
	                'desc': self._first(desc),
	            })

	        res.update({
	            'work_list': work_list,
	            'hometownlist': hometownlist,
	            'skill_list': skill_list,
	            'education_list': education_list,
	        })

	        # json.dumps escapes quotes inside scraped values; swapping ' for "
	        # in the dict repr produced invalid JSON for any value with a quote.
	        print(json.dumps(res, ensure_ascii=False))