Skip to content

Instantly share code, notes, and snippets.

@jpopesculian
Created March 14, 2015 22:14
Show Gist options
  • Save jpopesculian/62220fcb3afc603b0a80 to your computer and use it in GitHub Desktop.
Save jpopesculian/62220fcb3afc603b0a80 to your computer and use it in GitHub Desktop.
ETFSpider
from scrapy import Spider, Request
from project.items import ETFItem
class ETFSpider(Spider):
    """Scrape the paginated Yahoo Finance ETF list into ``ETFItem`` objects.

    One request is issued per result page (pages 1 through
    ``SETTINGS["num_pages"]``); every table row on a page becomes one item
    whose fields are read from the row's ``<td>`` cells by CSS class name.
    """

    name = "etf"
    # Scrapy expects bare domain names here, not URLs; a URL entry is not
    # usable as an offsite filter.
    allowed_domains = ["finance.yahoo.com"]
    SETTINGS = {
        # Base listing URL; the page number is appended to the end.
        "url": "http://finance.yahoo.com/etf/lists?mod_id=mediaquotesetf&tab=tab6&scol=nasset&stype=desc&rcnt=100&page=",
        # Number of result pages to request.
        "num_pages": 17,
    }

    def start_requests(self):
        """Yield one ``Request`` per listing page, handled by ``parse``."""
        base_url = self.SETTINGS["url"]
        # range() excludes its stop value, so add 1 to actually fetch the
        # last page. Assumes pages are numbered from 1 — TODO confirm.
        for page in range(1, self.SETTINGS["num_pages"] + 1):
            yield Request(base_url + str(page), callback=self.parse)

    def get_field(self, row, name):
        """Return the first text found in the ``td.<name>`` cell of *row*.

        The cell's own text is tried first, then text inside a ``<span>``,
        then text inside an ``<a>``. Returns ``""`` when none of them match.
        """
        cell = 'td.' + name
        for suffix in ('::text', ' span::text', ' a::text'):
            texts = row.css(cell + suffix).extract()
            if texts:
                return texts[0]
        return ""

    def parse(self, response):
        """Yield an ``ETFItem`` for every row of the listing table."""
        for row in response.css('div.yfi-table-container table tbody tr'):
            item = ETFItem()
            # Each declared item field doubles as the CSS class of its cell.
            for field in item.fields:
                item[field] = self.get_field(row, field)
            yield item
from scrapy import Item, Field
class ETFItem(Item):
    """Container for one row of the ETF listing table.

    ``ETFSpider.parse`` iterates over these field names and looks each one
    up as a ``td.<name>`` CSS class, so the names must match the class
    attributes used in the scraped page.
    """

    # NOTE(review): the meanings below are inferred from the abbreviations
    # only — confirm against the page markup.
    fname = Field()   # presumably fund name
    tkr = Field()     # presumably ticker symbol
    cat = Field()     # presumably category
    ffly = Field()    # presumably fund family
    nasset = Field()  # presumably net assets
    eratio = Field()  # presumably expense ratio
    tratio = Field()  # presumably turnover ratio
    ltype = Field()   # meaning unclear — TODO confirm
    idate = Field()   # presumably inception date
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment