Last active
December 26, 2016 12:37
-
-
Save filinvadim/d896492f6ec4852d84a515e7918e29b7 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
import urllib3 | |
import bs4 | |
import requests | |
from lxml import html | |
import argparse | |
# URL of the python.org home page.
source_url = 'http://python.org'
def get_python_events_bysoup(url):
    """Collect event dates and titles from a page using BeautifulSoup.

    Downloads ``url`` with urllib3, parses the body with the ``html.parser``
    backend and reads every ``<li>`` found inside the first element that
    carries the ``event-widget`` class.

    Args:
        url (str): Address of the page to download and parse.

    Returns:
        dict: Maps each event date (the part of the ``<time datetime=...>``
        attribute before ``'T'``) to the text of the event link. Empty when
        the page has no event widget. Entries sharing a date overwrite each
        other, so the last one listed wins.
    """
    http = urllib3.PoolManager()
    # Bound the wait so an unresponsive server cannot hang the caller.
    page = http.request(method='GET', url=url, timeout=10.0).data
    soup = bs4.BeautifulSoup(page, 'html.parser')

    widget = soup.find(class_='event-widget')
    if widget is None:
        # find() returns None when nothing matches; report "no events"
        # rather than failing on None.find_all, matching the XPath variant,
        # which also yields an empty dict for a page without the widget.
        return {}

    events = {}
    for item in widget.find_all('li'):
        time_tag = item.time
        link_tag = item.a
        stamp = time_tag.get('datetime') if time_tag is not None else None
        if not stamp or link_tag is None:
            # Skip list entries that lack a dated <time> or a link.
            continue
        events[stamp.split('T')[0]] = link_tag.text
    return events
def get_python_events_byxpath(url):
    """Collect event dates and titles from a page using lxml XPath queries.

    Downloads ``url`` with requests, parses the response text with
    ``lxml.html`` and walks the ``<li>`` items under the
    ``medium-widget event-widget last`` div.

    Args:
        url (str): Address of the page to download and parse.

    Returns:
        dict: Maps each event date (the part of the ``<time datetime=...>``
        attribute before ``'T'``) to the text of the event link. Empty when
        no matching list items are found. Entries sharing a date overwrite
        each other, so the last one listed wins.
    """
    # Bound the wait so an unresponsive server cannot hang the caller.
    response = requests.get(url, timeout=10)
    tree = html.document_fromstring(response.text)

    items = tree.xpath(
        '//div[@class="medium-widget event-widget last"]/div/ul/li'
    )

    events = {}
    for item in items:
        # Query date and title relative to the same <li>. Zipping two
        # page-wide lists would shift every later pair if a single item
        # lacked its <time> or its link.
        stamps = item.xpath('time/@datetime')
        titles = item.xpath('a/text()')
        if not stamps or not titles:
            continue
        events[stamps[0].split('T')[0]] = titles[0]
    return events
def main():
    """Parse the command line, run both scrapers and print their results.

    Command line:
        --seek URL  Page to scrape. Defaults to the module-level
                    ``source_url`` when omitted.
    """
    parser = argparse.ArgumentParser()
    # Without a default, omitting --seek handed None to both scrapers;
    # fall back to the module-level URL instead.
    parser.add_argument(
        '--seek', type=str, default=source_url, help='only_link'
    )
    args = parser.parse_args()

    soup_args = get_python_events_bysoup(args.seek)
    xpath_args = get_python_events_byxpath(args.seek)
    print(soup_args, xpath_args)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
import unittest
import dom_parser
class TestForSoupGrabber(unittest.TestCase):
    """Fixture for tests of the soup-based grabber (going by the class name).

    Only ``setUp`` is defined here; no test methods are present yet.
    """

    def setUp(self):
        """Store the URL that test methods are expected to fetch."""
        self.url = 'http://python.org'


# The guard had lost its double underscores (``name == 'main'``), which
# raises NameError as soon as the module is loaded.
if __name__ == '__main__':
    unittest.main()