roughly crawler: a quick Python script that crawls a range of blog pages in sequence and saves each page's title and paragraph text to a Markdown file.
from bs4 import BeautifulSoup
from progressbar import Bar, SimpleProgress, Percentage, ProgressBar
import urllib.error
import urllib.request

blog1 = "blog_url"  # base URL; the page number is appended to form each page's URL
input1 = 1   # first page to crawl
input2 = 41  # last page to crawl
def replaceTxt(f, txt):
    # Text containing "#" would render as a Markdown heading, so escape it:
    # backslash-escape underscores, prefix the line with a backslash, and write it.
    if "#" in txt:
        txt = txt.replace("_", "\\_")
        f.write("\\" + txt)
        return True
    return False
def spider(max_pages):
    page = input1
    # spoof a browser User-Agent so the blog does not reject the request
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1;)'
    headers = {'User-Agent': user_agent}
    # progress bar widgets
    widgets = ['Running: ', Percentage(), ' ',
               Bar(marker='#', left='[', right=']'),
               ' ', SimpleProgress()]
    fail_list = []
    pbar = ProgressBar(widgets=widgets, maxval=max_pages).start()
    while page <= max_pages:
        url = blog1 + str(page)
        # build the request with the spoofed headers
        request_url = urllib.request.Request(url, None, headers)
        # skip pages that fail to load, but remember their URLs
        try:
            url_open = urllib.request.urlopen(request_url)
        except urllib.error.URLError:
            fail_list.append(url)
            page += 1
            continue
        # read and parse the page
        source_code = url_open.read()
        soup = BeautifulSoup(source_code, 'lxml')
        f = open(str(page) + ".md", "w", encoding="utf-8")
        # title
        for link in soup.select('h2 > a'):
            title = link.string
            if title is None:
                continue
            f.write("# " + title + "\n")
        # paragraphs
        for paragraph in soup.select('p'):
            # case: p > span
            for span in paragraph.select('span'):
                txt = span.string
                if txt is None:
                    continue
                if replaceTxt(f, txt):
                    continue
                f.write(txt)
            f.write("\n")
            # case: p (paragraph.string is only set when the <p> has no child tags)
            txt = paragraph.string
            if txt is None:
                continue
            if replaceTxt(f, txt):
                continue
            f.write(txt)
        # done writing this page; move on to the next
        f.close()
        pbar.update(page)
        page += 1
    pbar.finish()
    print("\ncrawling is complete.")
    print("fail pages:", fail_list)

spider(input2)
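As a usage sketch: the gist ships with blog1 set to the placeholder "blog_url", so to crawl a real blog you would point blog1 at a base URL whose pages are numbered sequentially and set the page range via input1 and input2. The URL below is hypothetical, purely for illustration.

blog1 = "https://example.com/blog/page/"  # hypothetical base; pages live at .../page/1, .../page/2, ...
input1 = 1    # crawl starts at this page number
input2 = 41   # ...and stops here, writing 1.md through 41.md
spider(input2)  # prints any URLs that failed to load when done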