Created
July 24, 2017 10:21
-
-
Save jkotra/26bbe1261c066f2fabbac279516e2b27 to your computer and use it in GitHub Desktop.
SoccerScraper.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv | |
from urllib.request import Request, urlopen # Python 3 | |
from bs4 import BeautifulSoup | |
# Parallel lists, all indexed by the player's row position in engsoc.csv.
country = []  # column 0 of engsoc.csv
team = []     # column 1 of engsoc.csv
names1 = []   # column 2: raw player name strings (comma-separated form)
names2 = []   # names1 with commas stripped and split into word lists
names3 = []   # "Last+First" URL query tokens built from names2
links1 = []   # soccerwiki.org profile URLs (or "Not found")
links2 = []   # transfermarkt.co.uk profile URLs (or "Not found")
def csv_writer(player_id):
    """Append one player's record to output.csv.

    ``player_id`` indexes the module-level parallel lists (``country``,
    ``team``, ``names1``, ``links1``, ``links2``); one CSV row is written
    per call.
    """
    # BUG FIX: csv files must be opened with newline='' in Python 3,
    # otherwise the writer emits blank interleaved rows on platforms
    # that translate line endings (see the csv module docs).
    with open('output.csv', 'a', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow([country[player_id], team[player_id],
                         names1[player_id], links1[player_id],
                         links2[player_id]])
def url_scraper(player_name):
    """Search soccerwiki.org for one player and record a profile link.

    ``player_name`` is an index into the module-level ``names3`` list of
    "Last+First" query tokens.  Appends either a full profile URL or the
    string "Not found" to ``links1``, printing whichever was chosen.
    """
    query = names3[player_name]
    search_url = 'https://en.soccerwiki.org/wiki.php?action=search&searchType=all&q=%s' % query
    print(search_url)

    request = Request(search_url)
    # Browser-like headers: the site serves different markup to bare clients.
    request.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0')
    request.add_header('Referer', 'https://en.soccerwiki.org/')

    page = urlopen(request).read()
    soup = BeautifulSoup(page, 'html.parser')
    hrefs = [anchor['href'] for anchor in soup.findAll('a')]

    # The 18th anchor on the results page is taken as the first search hit.
    # Brittle positional scraping, but preserved from the page layout this
    # script was written against.
    candidate = hrefs[17]
    if candidate == 'http://www.grassrootsoccer.org/':
        # That sponsor link occupies the slot when the search found nothing.
        candidate = "Not found"
    else:
        candidate = "https://en.soccerwiki.org/" + candidate
    print(candidate)
    links1.append(candidate)
def url_scraper_2(player_name):
    """Search transfermarkt.co.uk for one player and record a profile link.

    ``player_name`` indexes the module-level ``names3`` (query token) and
    ``team`` (for disambiguation) lists.  Appends the best-guess profile
    URL, or "Not found", to ``links2``.

    NOTE(review): the nested dc1/dc2 indentation was reconstructed from a
    whitespace-mangled source — confirm against the original gist.
    """
    url_player_name = names3[player_name]
    url_player = 'https://www.transfermarkt.co.uk/schnellsuche/ergebnis/schnellsuche?query=%s' % url_player_name
    print(url_player)
    url = Request(url_player)
    url.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0')
    url.add_header('Referer', 'https://www.transfermarkt.co.uk/')
    # BUG FIX: the original fetched the same page twice over the network
    # (once for BeautifulSoup, once for the "2 Hits" substring check).
    # Fetch once and reuse the bytes for both purposes.
    page_bytes = urlopen(url).read()

    slot1 = []
    soup = BeautifulSoup(page_bytes, 'html.parser')
    for link in soup.find_all('a'):
        slot1.append(link.get('href'))
    # Positional scraping: the 13th-from-last anchor is the usual hit slot.
    slot1_url = "https://www.transfermarkt.co.uk" + slot1[-13]

    page_text = page_bytes.decode('utf-8')
    dc0 = '2 Hits' in page_text
    print("result is" + str(dc0) + ".")
    if dc0:
        # Two hits: disambiguate by checking which candidate mentions the team.
        dc1 = team[player_name] in slot1[-14]
        print(dc1)
        if not dc1:
            slot1_url = "https://www.transfermarkt.co.uk" + slot1[-14]
            print("Fixedurl:" + slot1_url)
            dc2 = team[player_name] in slot1[-9]
            if dc2:
                slot1_url = "https://www.transfermarkt.co.uk" + slot1[-9]
    # Known sentinel anchors that mean the search produced no real profile.
    if slot1_url == "https://www.transfermarkt.co.uk/profil/einstellungen":
        slot1_url = "Not found"
    if slot1_url == "https://www.transfermarkt.co.uk#":
        slot1_url = "Not found"
    links2.append(slot1_url)
def name_processor(player_index):
    """Normalise one raw name from ``names1`` into a word list in ``names2``.

    Commas are stripped and the remainder is split on whitespace, so
    "Smith, John" becomes ["Smith", "John"].
    """
    cleaned = names1[player_index].replace(",", "")
    names2.append(cleaned.split())
def name_processor_2(player_index):
    """Build a "Last+First" URL query token from ``names2`` into ``names3``.

    Joins the last and first words of the split name with a '+', the form
    both search URLs expect as their query string.
    """
    words = names2[player_index]
    names3.append(words[-1] + "+" + words[0])
#def openpage(): | |
def count(num):
    """Print ``num`` incremented by one (debug helper; unused by the script)."""
    print(num + 1)
print ("----------SoccerScraper---------")

# Load the input roster: each engsoc.csv row is (country, team, name).
# BUG FIX: use a context manager so the file is closed even if a row
# is malformed and the read raises (the original open/close pair leaked
# the handle on error).
with open("engsoc.csv") as f1_read:
    for row in csv.reader(f1_read):
        country.append(row[0])
        team.append(row[1])
        names1.append(row[2])

list_len = len(names1)
# Pipeline: normalise names, build query tokens, then scrape both sites.
for i in range(list_len):
    name_processor(i)
for i in range(list_len):
    name_processor_2(i)
for i in range(list_len):
    url_scraper(i)
for i in range(list_len):
    url_scraper_2(i)
for i in range(len(links1)):
    csv_writer(i)
# Redundant at end-of-script, but preserved from the original.
exit()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment