Skip to content

Instantly share code, notes, and snippets.

@narate
Last active October 22, 2015 10:53
Show Gist options
  • Save narate/324ad17d61a069788b95 to your computer and use it in GitHub Desktop.
Save narate/324ad17d61a069788b95 to your computer and use it in GitHub Desktop.
Scrapy spider that crawls the thaischool.in.th sitemap pages.
# -*- coding: utf-8 -*-
import scrapy
from thaischool.items import ThaischoolItem
class SchoolSpider(scrapy.Spider):
    """Spider that reads the paginated sitemap table of thaischool.in.th.

    ``start_urls`` lists sitemap pages 1 through 235. ``parse`` turns each
    table row of a page into one ``ThaischoolItem`` with the keys ``no``,
    ``name``, ``address``, ``phone`` and ``zone``.
    """

    name = "school"
    allowed_domains = ["thaischool.in.th"]
    base_url = 'http://www.thaischool.in.th/sitemap.php?page=%s&school_area=&province_id=&txtsearch='

    # One start URL per sitemap page, 1..235 inclusive.
    # This is deliberately a plain loop: in Python 3 a list comprehension
    # inside a class body cannot see the class-level name ``base_url``.
    start_urls = []
    for i in range(1, 236):
        start_urls.append(base_url % i)

    @staticmethod
    def _first_text(cell, path, default='-'):
        """Return the first string ``path`` selects under ``cell``.

        Falls back to ``default`` when the XPath matches nothing, so an
        empty table cell yields a placeholder instead of an IndexError.
        """
        found = cell.xpath(path).extract()
        return found[0] if found else default

    def parse(self, response):
        """Yield one ``ThaischoolItem`` per data row of the sitemap table.

        Rows are taken from the second table inside the page's form,
        skipping the first row (presumably the header -- confirm against
        the live page). Rows with fewer than five cells are ignored, and
        any cell without text is stored as ``'-'``.
        """
        rows = response.xpath('/html/body/center/form/table[2]/tr[position()>1]')
        for row in rows:
            cells = row.xpath('td')
            # A row without all five columns cannot be a school record;
            # indexing it would raise and abort the rest of the page.
            if len(cells) < 5:
                continue
            yield ThaischoolItem({
                'no': self._first_text(cells[0], 'text()'),
                # The school name is the text of the link inside the cell.
                'name': self._first_text(cells[1], 'a/text()'),
                'address': self._first_text(cells[2], 'text()'),
                'phone': self._first_text(cells[3], 'text()'),
                'zone': self._first_text(cells[4], 'text()'),
            })
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment