Created
January 3, 2020 10:13
-
-
Save tracylemke/9102e825d8bd99119f2c664a7051c9bc to your computer and use it in GitHub Desktop.
Web Scraper in Python: scrapes a table
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from lxml import html | |
from lxml.html.clean import Cleaner | |
import requests | |
def remove_duplicates(mylist):
    """Return a new list with duplicate items removed, keeping first-seen order.

    Args:
        mylist: An iterable of hashable items (e.g. the strings returned by
            an XPath ``text()`` query).

    Returns:
        A list holding each distinct item exactly once, in the order in which
        it first appeared in ``mylist``. The input is not modified.

    Raises:
        TypeError: If any item is unhashable, since items are used as
            dictionary keys.
    """
    # dict keys are unique and keep insertion order, so round-tripping through
    # dict.fromkeys de-duplicates without reordering.
    return list(dict.fromkeys(mylist))
# HTML cleaner configured to strip JavaScript.
# NOTE(review): nothing in the visible script applies `cleaner` to the fetched
# page; it is kept so the module-level name continues to exist.
cleaner = Cleaner()
cleaner.javascript = True

# Scrape list of games
### -- list of sample data -- ###
# Alternative (url, path) pairs that can be swapped in for the active pair:
# url = "http://localhost/test/data.php"
# path = '//tbody[@id="databody"]//a[@target="_parent"]/text()'
# url = "https://www.mmorpg.com/games-list"
# path = '//div[@class="iside"]//a/text()'
url = 'https://en.wikipedia.org/wiki/List_of_Xbox_games'
# Text of every <a class="mw-redirect"> link found inside a table cell.
path = '//td//a[@class="mw-redirect"]/text()'

# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops it from silently parsing an HTTP error page.
page = requests.get(url, timeout=30)
page.raise_for_status()
data = page.text
tree = html.fromstring(data)

titles = tree.xpath(path)
print(len(titles))
for game_title in titles:
    print(f"'{game_title}'")

# NOTE(review): this XPath collects the text of every link in the
# "softwarelist" table cells, not only a genre column — confirm that
# "genres" is the intended meaning.
genres = tree.xpath('//table[@id="softwarelist"]//tr//td//a/text()')
# De-duplicate once and reuse the count in the report below.
total_uniques = len(remove_duplicates(genres))
print(f"There are {len(genres)} genres but {total_uniques} are unique ")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment