depp · February 26, 2019 15:56
diff --git a/scrape.py b/scrape.py
 # https://stackoverflow.com/questions/54876010
 from bs4 import BeautifulSoup
 import json
 import requests
 import sys
 from urllib.parse import urljoin

 def get_list(uri, timeout=30):
    """Scan the manga list page for links to individual manga.

    Args:
        uri: Address of the listing page; relative links are resolved
            against it.
        timeout: Value passed to ``requests.get`` as its ``timeout``
            (seconds).

    Yields:
        The absolute URI of each link found inside a ``<span>`` whose class
        is ``manga-1`` or ``manga-2``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout a stalled server can hang the scrape.
    response = requests.get(uri, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page as the list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    for item in soup.find_all('span', class_=['manga-1', 'manga-2']):
        link = item.find('a', href=True)
        if link is None:
            # A span without a usable link would otherwise raise here and
            # end the whole scan; skip it and keep going.
            continue
        yield urljoin(uri, link['href'])

 def get_info(uri, timeout=30):
    """Fetch one manga page and extract its title and summary.

    Args:
        uri: Address of the manga page.
        timeout: Value passed to ``requests.get`` as its ``timeout``
            (seconds).

    Returns:
        A dict with the keys ``muri`` (the page address), ``mname`` (string
        of the first ``<h1>``) and ``msubject`` (string of the node right
        after the ``<h3>`` reading 'Tanıtım').

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        AttributeError: If the page has no ``<h1>`` or no matching ``<h3>``.
    """
    # Without a timeout a stalled server can hang the scrape.
    response = requests.get(uri, timeout=timeout)
    # Report HTTP errors as such rather than as a confusing parse failure.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    return {
        "muri": uri,
        # Find an <h1> tag, this is the title
        'mname': soup.find('h1').string,
        # Find a <h3> tag containing 'Tanıtım', the following text is the
        # summary
        'msubject': soup.find('h3', text='Tanıtım').next_sibling.string,
    }

 def run(list_uri='https://www.manga-tr.com/manga-list.html',
         output_path='allmanga.json', max_items=10, max_errors=5):
    """Scrape a handful of manga and save them as JSON.

    Walks the links produced by ``get_list``, collects the ``get_info``
    result for each one and writes ``{"manga": [...]}`` to ``output_path``.
    Whatever was collected so far is still written when the run stops early
    (item limit reached, too many errors, or Ctrl-C).

    Args:
        list_uri: Listing page to start from.
        output_path: File that receives the JSON result.
        max_items: Stop once this many manga have been collected (please be
            polite when scraping!).
        max_errors: Abort once this many manga pages have failed.
    """
    all_manga = []
    num_errors = 0
    try:
        # The outer try also covers the listing request, which get_list
        # performs lazily when the loop asks for the first link.
        for uri in get_list(list_uri):
            try:
                all_manga.append(get_info(uri))
            except Exception as ex:
                # One broken page should not end the run; count it instead.
                num_errors += 1
                print('Failed to get info for {}:'.format(uri), ex,
                      file=sys.stderr)
                if num_errors >= max_errors:
                    print('Too many errors, aborting', file=sys.stderr)
                    break
            if len(all_manga) >= max_items:
                break
    except KeyboardInterrupt:
        print('Ctrl-C, scrape canceled')
    manga = {
        'manga': all_manga,
    }
    # 'utf-8-sig' puts a UTF-8 byte order mark at the start of the file.
    with open(output_path, 'w', encoding='utf-8-sig') as outfile:
        json.dump(manga, outfile, indent=4)
        outfile.write('\n')

 if __name__ == "__main__":
    # Start the scrape only when executed as a script, not on import.
    run()
	# https://stackoverflow.com/questions/54876010
	from bs4 import BeautifulSoup
	import json
	import requests
	import sys
	from urllib.parse import urljoin

	def get_list(uri, timeout=30):
	    """Scan the manga list page for links to individual manga.

	    Args:
	        uri: Address of the listing page; relative links are resolved
	            against it.
	        timeout: Value passed to ``requests.get`` as its ``timeout``
	            (seconds).

	    Yields:
	        The absolute URI of each link found inside a ``<span>`` whose
	        class is ``manga-1`` or ``manga-2``.

	    Raises:
	        requests.RequestException: If the request fails, times out, or
	            the server answers with an HTTP error status.
	    """
	    # Without a timeout a stalled server can hang the scrape.
	    response = requests.get(uri, timeout=timeout)
	    # Fail loudly on 4xx/5xx instead of parsing an error page as the list.
	    response.raise_for_status()
	    soup = BeautifulSoup(response.text, 'html.parser')
	    for item in soup.find_all('span', class_=['manga-1', 'manga-2']):
	        link = item.find('a', href=True)
	        if link is None:
	            # A span without a usable link would otherwise raise here and
	            # end the whole scan; skip it and keep going.
	            continue
	        yield urljoin(uri, link['href'])

	def get_info(uri, timeout=30):
	    """Fetch one manga page and extract its title and summary.

	    Args:
	        uri: Address of the manga page.
	        timeout: Value passed to ``requests.get`` as its ``timeout``
	            (seconds).

	    Returns:
	        A dict with the keys ``muri`` (the page address), ``mname``
	        (string of the first ``<h1>``) and ``msubject`` (string of the
	        node right after the ``<h3>`` reading 'Tanıtım').

	    Raises:
	        requests.RequestException: If the request fails, times out, or
	            the server answers with an HTTP error status.
	        AttributeError: If the page has no ``<h1>`` or no matching
	            ``<h3>``.
	    """
	    # Without a timeout a stalled server can hang the scrape.
	    response = requests.get(uri, timeout=timeout)
	    # Report HTTP errors as such rather than as a confusing parse failure.
	    response.raise_for_status()
	    soup = BeautifulSoup(response.text, 'html.parser')
	    return {
	        "muri": uri,
	        # Find an <h1> tag, this is the title
	        'mname': soup.find('h1').string,
	        # Find a <h3> tag containing 'Tanıtım', the following text is the
	        # summary
	        'msubject': soup.find('h3', text='Tanıtım').next_sibling.string,
	    }

	def run(list_uri='https://www.manga-tr.com/manga-list.html',
	        output_path='allmanga.json', max_items=10, max_errors=5):
	    """Scrape a handful of manga and save them as JSON.

	    Walks the links produced by ``get_list``, collects the ``get_info``
	    result for each one and writes ``{"manga": [...]}`` to
	    ``output_path``. Whatever was collected so far is still written when
	    the run stops early (item limit reached, too many errors, or Ctrl-C).

	    Args:
	        list_uri: Listing page to start from.
	        output_path: File that receives the JSON result.
	        max_items: Stop once this many manga have been collected (please
	            be polite when scraping!).
	        max_errors: Abort once this many manga pages have failed.
	    """
	    all_manga = []
	    num_errors = 0
	    try:
	        # The outer try also covers the listing request, which get_list
	        # performs lazily when the loop asks for the first link.
	        for uri in get_list(list_uri):
	            try:
	                all_manga.append(get_info(uri))
	            except Exception as ex:
	                # One broken page should not end the run; count it.
	                num_errors += 1
	                print('Failed to get info for {}:'.format(uri), ex,
	                      file=sys.stderr)
	                if num_errors >= max_errors:
	                    print('Too many errors, aborting', file=sys.stderr)
	                    break
	            if len(all_manga) >= max_items:
	                break
	    except KeyboardInterrupt:
	        print('Ctrl-C, scrape canceled')
	    manga = {
	        'manga': all_manga,
	    }
	    # 'utf-8-sig' puts a UTF-8 byte order mark at the start of the file.
	    with open(output_path, 'w', encoding='utf-8-sig') as outfile:
	        json.dump(manga, outfile, indent=4)
	        outfile.write('\n')

	if __name__ == "__main__":
	    # Start the scrape only when executed as a script, not on import.
	    run()
No results found