
@metaperl
Created May 10, 2020 09:09
import scrapy
from behold import Behold
import html_text
import durations


class SignalStartSpider(scrapy.Spider):
    name = 'signalstart'
    start_urls = [
        'https://www.signalstart.com/search-signals',
    ]

    def parse_details(self, response):

        def split_field():
            pass

        class Details(scrapy.Item):
            xpath = scrapy.Field()
            extractor = scrapy.Field()

        fields = {
            'won': Details(),
            'profit_factor': Details(),
            'daily': Details(),
            'monthly': Details(),
        }

        fields['won']['xpath'] = "//li[contains(text(),'Won:')]"
        fields['won']['extractor'] = split_field
        fields['profit_factor']['xpath'] = "//li[@class='list-group-item popovers']"
        fields['daily']['xpath'] = "//li[contains(text(),'Daily:')]"
        fields['monthly']['xpath'] = "//li[contains(text(),'Monthly:')]"

        for field, field_processor in fields.items():
            print(f"     Process {field}")
            elem = response.xpath(field_processor['xpath'])
            response.meta["data_row"][field] = html_text.extract_text(elem.get())

    def parse(self, response):
        cols = "rank name gain pips drawdown trades type monthly chart price age added action"
        skip = [7, 8, 11, 12]

        def age_to_months(t):
            # durations treats a lowercase 'm' as minutes, so normalize to
            # 'M' (months) before parsing
            t = t.replace('m', 'M')
            d = durations.Duration(t)
            return d.to_months()

        postprocess = {
            'age': age_to_months
        }

        td = dict()
        for i, col in enumerate(cols.split()):
            td[i] = col

        Behold().show('td')

        for provider in response.xpath("//div[@class='row']//tr"):
            data_row = dict()
            Behold().show('provider')
            for i, datum in enumerate(provider.xpath('td')):
                Behold().show('i', 'datum')
                if i == 1:  # name
                    url = datum.css("a::attr(href)").get()
                    yield scrapy.Request(url=url, callback=self.parse_details, meta={'data_row': data_row})
                if i in skip:
                    print(".....skipping")
                    continue
                text = html_text.extract_text(datum.get())
                column_name = td[i]
                if column_name in postprocess:
                    text = postprocess[column_name](text)
                data_row[column_name] = text
            print(f" ---> final data row: {data_row}")
            yield data_row

        # next_page = response.css('.fa-angle-right').get()
        # if next_page is not None:
        #     yield response.follow(next_page, self.parse)
@metaperl (Author) commented:

# -*- coding: utf-8 -*-
import scrapy
from behold import Behold
import html_text
import durations
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException


URL_20 = "https://www.signalstart.com/search-signals"
URL_1000="https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=1000&p=1&z=0.410257937140464"
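# In the paging URL, ps looks like the page size and p the page number
# (an inference from the query string, not documented by the site).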

class Provider(scrapy.Item):
    rank = scrapy.Field()
    name = scrapy.Field()
    gain = scrapy.Field()
    pips = scrapy.Field()
    drawdown = scrapy.Field()
    trades = scrapy.Field()
    type = scrapy.Field()
    monthly = scrapy.Field()
    # chart = scrapy.Field()
    price = scrapy.Field()
    age = scrapy.Field()
    # added = scrapy.Field()
    # action = scrapy.Field()
    won = scrapy.Field()
    profit_factor = scrapy.Field()
    daily = scrapy.Field()

def raw_page_url(i=1):
    """
    Return the URL for a raw page of 100 results. There are 8 such pages.
    :param i: which page number
    :return: the paging URL for page i
    """
    return "https://www.signalstart.com/paging.html?pt=1&sb=48&st=1&ts=705&yieldType=&yieldVal=&drawType=&drawVal=&pipsType=&pipsVal=&type=&ageType=&tradesType=&tradesVal=&priceType=&priceVal=&fifoVal=&searchVal=&serversMultiSearch=&ps=100&p={}&z=0.024967722664414493".format(i)

class SignalStartSpider(scrapy.Spider):

    page = 1

    name = 'signalstart'
    start_urls = [
        # raw_page_url(page),
        URL_20
    ]

    def __init__(self):
        #self.driver = webdriver.Firefox(executable_path = r'C:\Users\terre\AppData\Local\Taurus\bin\geckodriver.exe')
        self.driver = webdriver.Firefox(executable_path=r'/cygdrive/c/Users/terre/AppData/Local/Taurus/bin/geckodriver.exe')

    def parse_details(self, response):

        class Details(scrapy.Item):
            xpath = scrapy.Field()
            extractor = scrapy.Field()  # I thought different fields would be extracted differently, but it turns out they don't.

        fields = {
            'won': Details(),
            'profit_factor': Details(),
            'daily': Details(),
            'monthly': Details()
        }

        fields['won']['xpath'] = "//li[contains(text(),'Won:')]"
        fields['profit_factor']['xpath'] = "//li[@class='list-group-item popovers']"
        fields['daily']['xpath'] = "//li[contains(text(),'Daily:')]"
        fields['monthly']['xpath'] = "//li[contains(text(),'Monthly:')]"

        for field, field_processor in fields.items():
            print(f"     Process {field}")
            elem = response.xpath(field_processor['xpath'])
            # each list item reads like "Won: 85%", so split on the first
            # colon and keep the value part
            _, value = html_text.extract_text(elem.get()).split(':', 1)
            response.meta["data_row"][field] = value.strip()
        yield response.meta["data_row"]

    def parse(self, response):

        print(" >>>>>> URL of the response object is {}".format(type(response)))
        self.driver.get(response.url)

        cols = "rank name gain pips drawdown trades type monthly chart price age added action"

        skip = [7, 8, 11, 12]

        def age_to_months(t):
            # durations treats a lowercase 'm' as minutes, so normalize to
            # 'M' (months) before parsing
            t = t.replace('m', 'M')
            d = durations.Duration(t)
            return d.to_months()

        postprocess = {
            'age': age_to_months
        }

        td = dict()
        for i, col in enumerate(cols.split()):
            td[i] = col

        Behold().show('td')

        for provider in response.xpath("//div[@class='row']//tr"):
            data_row = Provider()
            Behold().show('provider')
            details_url = None

            for i, datum in enumerate(provider.xpath('td')):
                Behold().show('i', 'datum')
                if i == 1: # name
                    details_url = datum.css("a::attr(href)").get()
                if i in skip:
                    print(".....skipping")
                    continue
                text = html_text.extract_text(datum.get())
                column_name = td[i]
                if column_name in postprocess:
                    text = postprocess[column_name](text)
                data_row[column_name] = text
            if details_url:
                yield scrapy.Request(url=details_url, callback=self.parse_details, meta={'data_row': data_row})

        print("------------------------------- next page logic --------------------------------------")

        next = self.driver.find_element_by_css_selector('.fa-angle-right')
        if next is not None:
            print(" **** NEXT IS -NOT- NONE")
            next.click()
            page_source = self.driver.page_source
            r = scrapy.http.HtmlResponse('_')
        else:
            print(" **** NEXT IS NONE")

        # next_page = response.css('.fa-angle-right').get()
        # if next_page is not None:
        #     yield response.follow(next_page, self.parse)
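
For anyone trying this locally, here is a minimal sketch of a runner script. It is my assumption, not part of the gist; it uses Scrapy's standard CrawlerProcess API, and the FEEDS setting (Scrapy >= 2.1) to dump items to providers.json:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={
    # write every scraped item to providers.json as a JSON array
    'FEEDS': {'providers.json': {'format': 'json'}},
})
process.crawl(SignalStartSpider)
process.start()  # blocks until the crawl finishes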
