Created
January 17, 2016 15:03
-
-
Save RobertMatkulcik/3d073be7de64738c3933 to your computer and use it in GitHub Desktop.
data scraping
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
from bs4 import BeautifulSoup | |
from requests import Session | |
import csv | |
from selenium.webdriver import Firefox | |
from selenium.common.exceptions import NoSuchElementException | |
def titulky_content(session, url):
    """Fetch *url* through *session* and return the matching table rows.

    The response body is parsed with BeautifulSoup's ``html.parser``. The
    returned collection holds every ``<tr>`` matched by ``class_="r1"``
    first, followed by every ``<tr>`` matched by ``class_="r"``; it is
    therefore grouped by class, not kept in page order.
    """
    response = session.get(url)
    soup = BeautifulSoup(response.content, "html.parser")

    # Collect the rows of both classes into a single result.
    rows = soup.findAll("tr", class_="r1")
    rows.extend(soup.findAll("tr", class_="r"))
    return rows
def extract_film(film):
    """Return ``[name, href]`` for the first ``a`` element found in *film*.

    *film* must provide ``find``; the name is the ``text`` attribute of the
    object returned by ``film.find("a")`` and the href is whatever its
    ``get("href")`` returns.
    """
    anchor = film.find("a")
    return [anchor.text, anchor.get("href")]
# Save the films to a CSV file.
def csv_writer(filename, el):
    """Write the films in *el* to *filename* as CSV, replacing any old file.

    The first row is the header ``Názov, Url``. Every following row is the
    ``[name, href]`` pair that ``extract_film`` returns for one item of *el*.

    Args:
        filename: Path of the CSV file to create.
        el: Iterable of items accepted by ``extract_film``.
    """
    # newline="" hands line-ending control to the csv module. Without it the
    # writer's "\r\n" becomes "\r\r\n" on platforms that translate "\n",
    # which shows up as a blank row after every record.
    with open(filename, "w", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["Názov", "Url"])
        writer.writerows(extract_film(film) for film in el)
# read from csv | |
def csv_reader(filename):
    """Print every line of *filename* to standard output.

    Each line is printed exactly as read, line terminator included, so
    ``print`` adds a second newline after it.
    """
    with open(filename, "r") as handle:
        for line in handle:
            print(line)
# Collect the film names (first CSV column) into a list.
def to_array(filename):
    """Return the first column of the CSV file *filename*, header excluded.

    Args:
        filename: Path of a CSV file whose first row is a header.

    Returns:
        A list holding the first field of every non-empty data row, in file
        order. A file with no rows at all yields an empty list.
    """
    # newline="" is how the csv module expects its input files to be opened.
    with open(filename, "r", newline="") as csv_file:
        reader = csv.reader(csv_file)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        # csv.reader yields [] for blank lines, which have no first column.
        return [row[0] for row in reader if row]
def get_driver():
    """Create and return a new ``Firefox`` WebDriver instance."""
    return Firefox()
def drive(name_array, csfd_url, int_i):
    """Look up one film on the page at *csfd_url* and return its rating text.

    A new browser is started for every call. The film name
    ``name_array[int_i]`` is typed into the element with class ``text``, the
    element with class ``submit`` is clicked, the first film hit is opened
    and the text of the element with class ``average`` is read.

    Args:
        name_array: Sequence of film names.
        csfd_url: URL of the page that holds the search form.
        int_i: Index into *name_array* of the film to look up.

    Returns:
        The rating text with every ``!`` and ``%`` character removed.

    Raises:
        IndexError: If *int_i* is out of range for *name_array*.
        NoSuchElementException: If the search box, the submit button or the
            rating element cannot be found.
    """
    # Function-scope import from the selenium package the file already uses.
    # find_element(By..., ...) works on old and current Selenium releases,
    # whereas the find_element_by_* helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    # Resolve the name first so a bad index fails before a browser starts.
    film_name = name_array[int_i]

    driver = get_driver()
    try:
        driver.get(csfd_url)
        driver.find_element(By.CLASS_NAME, "text").send_keys(film_name)
        driver.find_element(By.CLASS_NAME, "submit").click()
        try:
            first_hit = driver.find_element(
                By.XPATH,
                '//*[@id="search-films"]/div[1]/ul[1]/li[1]/div/h3/a',
            )
            first_hit.click()
        except NoSuchElementException:
            # Best effort: report the missing search hit and still try to
            # read a rating from whatever page is currently shown.
            print("NoSuchElementException")
        rating = driver.find_element(By.CLASS_NAME, "average").text
        return re.sub(r"[!%]", "", rating)
    finally:
        # Always close the browser; otherwise each failed lookup would leave
        # a browser process running.
        driver.quit()
# def twod_list(columns, rows): | |
# a = [[x for x in range(columns)] for y in range(rows)] | |
# return a | |
def main(username, password):
    """Scrape the newest titles from titulky.com and print their CSFD ratings.

    Steps:
        1. POST ``Login``/``Password`` to the titulky.com root URL. The
           response is not inspected, so a failed login is not detected here.
        2. Fetch the newest-subtitles listing and write the films to
           ``output.csv`` in the current directory.
        3. Read the film names back from that file and look each one up on
           csfd.cz, printing the rating text.

    The lookups stop at the first film whose lookup raises an exception.

    Args:
        username: Value sent as the ``Login`` form field.
        password: Value sent as the ``Password`` form field.
    """
    # titulky.com
    login_url = "http://www.titulky.com/"
    najnovsie_titulky_url = "http://www.titulky.com/?orderby=3&OrderDate=2"
    # csfd.cz
    csfd_url = "http://www.csfd.cz/"
    # Output CSV file
    filename = "output.csv"

    post_data = {
        "Login": username,
        "Password": password,
    }

    # The context manager closes the session's connections when done.
    with Session() as session:
        session.post(login_url, post_data)
        # Fetch the listing of the newest subtitles.
        el = titulky_content(session, najnovsie_titulky_url)
    csv_writer(filename, el)

    name_array = to_array(filename)
    for int_i in range(len(name_array)):
        try:
            csfd_average = drive(name_array, csfd_url, int_i)
        except Exception:
            # Exception rather than a bare except, so Ctrl-C and SystemExit
            # still stop the script instead of being reported as a lookup
            # failure.
            print("chyba nenasiel sa ziaden element")
            break
        else:
            print(csfd_average)
if __name__ == '__main__':
    # Imported here because only the script entry point needs them.
    import getpass
    import sys

    username = "E.T.Bong"
    # On a terminal, read the password without echoing it. When stdin is
    # piped or redirected, keep reading a plain line so scripted use works
    # as before.
    password = getpass.getpass() if sys.stdin.isatty() else input()
    main(username, password)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment