Last active
December 10, 2015 23:38
-
-
Save iantropov/4510390 to your computer and use it in GitHub Desktop.
First implementation of a parser for Yandex's result pages
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import html | |
class YandexRespParser(object):
    """Scrape Yandex search result pages into plain dictionaries.

    The parser downloads ``http://ya.ru/yandsearch`` result pages with
    ``lxml.html.parse`` and extracts one dictionary per result item.

    NOTE(review): ``query`` and ``lr`` are interpolated into the URL
    verbatim, so callers must pass values that are already URL-safe.
    """

    # Number of results expected on one full result page. It may be possible
    # to derive this number from the first result page instead.
    __RESULTS_ON_PAGE = 10

    def __init__(self, query, lr):
        """Remember the search parameters used for every page request.

        Args:
            query: value of the ``text`` URL parameter (the search phrase).
            lr: value of the ``lr`` URL parameter (presumably the Yandex
                region id -- confirm against the caller).
        """
        self.query = query
        self.lr = lr

    def __create_rep(self, result_element):
        """Convert one result item element into a dictionary.

        Args:
            result_element: lxml element of a single search result.

        Returns:
            dict with the keys ``index`` (int), ``href``, ``domain``,
            ``title``, ``text`` and ``copy`` (all ``str``).

        Raises:
            IndexError: if an expected child node is absent from the item.
            ValueError: if the result number is not an integer literal.
        """
        def first(xpath):
            # Every field comes from the first node matching the expression.
            return result_element.xpath(xpath)[0]

        index = int(first('.//b[@class="b-serp-item__number"]').text_content())
        # The title link supplies both the target URL and the title text.
        title_link = first('.//a[@class="b-serp-item__title-link"]')
        return {
            "index": index,
            "href": str(title_link.get("href")),
            "domain": str(
                first('.//a[@class="b-serp-url__link"]').text_content()),
            "title": str(title_link.text_content()),
            "text": str(
                first('.//div[@class="b-serp-item__text"]').text_content()),
            "copy": str(
                first('.//a[@class="b-serp-item__links-link"]').get("href")),
        }

    def __get_reps_from_page(self, page_number,
                             results_from_page=__RESULTS_ON_PAGE):
        """Download one result page and parse its leading result items.

        Args:
            page_number: zero-based page index sent as the ``p`` parameter.
            results_from_page: maximum number of items to take from the page.

        Returns:
            list of result dictionaries; shorter than ``results_from_page``
            when the page holds fewer matching items.
        """
        url = "http://ya.ru/yandsearch?text={0}&lr={1}&p={2}".format(
            self.query, self.lr, page_number)
        page = html.parse(url)
        result_elements = page.xpath('//li[@class="b-serp-item i-bem"]')
        # Clamp at zero so a negative limit yields no items rather than
        # slicing from the end of the list.
        limit = max(results_from_page, 0)
        return [self.__create_rep(element)
                for element in result_elements[:limit]]

    def get_reps(self, count):
        """Return up to ``count`` search results, fetching pages in order.

        Args:
            count: total number of results wanted (an integer).

        Returns:
            list of result dictionaries in page order. A non-positive
            ``count`` yields an empty list without any network request.
        """
        if count <= 0:
            return []

        # divmod floors on both Python 2 and 3, so the page count stays an
        # int that range() accepts.
        full_pages, remainder = divmod(count, self.__RESULTS_ON_PAGE)

        reps = []
        for page_number in range(full_pages):
            reps.extend(self.__get_reps_from_page(page_number))
        # Request the trailing partial page only when something is needed
        # from it; otherwise the download would be wasted.
        if remainder:
            reps.extend(self.__get_reps_from_page(full_pages, remainder))
        return reps
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment