@depp
Created February 26, 2019 15:56
Scraping with Python
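A short scraper for https://www.manga-tr.com, written in answer to the Stack Overflow question linked in the source comment: it walks the site's manga list, collects each manga's title and summary, and writes the first ten entries to allmanga.json.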
# https://stackoverflow.com/questions/54876010
import json
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def get_list(uri):
    """Scan the manga list for links to manga."""
    response = requests.get(uri)
    soup = BeautifulSoup(response.text, 'html.parser')
    for item in soup.find_all('span', class_=['manga-1', 'manga-2']):
        manga_uri = item.find('a')['href']
        yield urljoin(uri, manga_uri)


def get_info(uri):
    """Get info for a manga."""
    response = requests.get(uri)
    soup = BeautifulSoup(response.text, 'html.parser')
    return {
        'muri': uri,
        # The page's <h1> tag holds the title
        'mname': soup.find('h1').string,
        # The text following the <h3> tag reading 'Tanıtım'
        # ("introduction" in Turkish) is the summary
        'msubject': soup.find('h3', text='Tanıtım').next_sibling.string,
    }


def run():
    all_manga = []
    num_errors = 0
    for uri in get_list('https://www.manga-tr.com/manga-list.html'):
        try:
            all_manga.append(get_info(uri))
        except Exception as ex:
            num_errors += 1
            print('Failed to get info for {}:'.format(uri), ex, file=sys.stderr)
            if num_errors >= 5:
                print('Too many errors, aborting', file=sys.stderr)
                break
        except KeyboardInterrupt:
            print('Ctrl-C, scrape canceled')
            break
        # Stop after getting 10 items (please be polite when scraping!)
        if len(all_manga) >= 10:
            break
    manga = {
        'manga': all_manga,
    }
    # utf-8-sig prepends a BOM, which helps some Windows tools
    # detect the encoding
    with open('allmanga.json', 'w', encoding='utf-8-sig') as outfile:
        json.dump(manga, outfile, indent=4)
        outfile.write('\n')


if __name__ == '__main__':
    run()
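For reference, the script produces allmanga.json with a single top-level 'manga' list, each entry carrying the muri/mname/msubject keys built in get_info(). The values below are placeholders to show the shape, not real scraped data:

{
    "manga": [
        {
            "muri": "https://www.manga-tr.com/manga-example.html",
            "mname": "Example Manga Title",
            "msubject": "Example summary text."
        }
    ]
}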
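The inline comment in run() asks scrapers to be polite. Beyond capping the run at 10 items, a fixed delay between requests is the usual courtesy; here is a minimal sketch (the wrapper name and the one-second default are my additions, not part of the original script):

import time

def get_info_politely(uri, delay=1.0):
    """Hypothetical wrapper around get_info() that sleeps before each
    request. The 1-second default is an assumption; adjust it to the
    site's robots.txt or terms of service."""
    time.sleep(delay)  # simple fixed delay between requests
    return get_info(uri)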