Last active
December 24, 2019 05:12
-
-
Save JeyKJey/7927b27cf7a831c48e223b7c06fbd401 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import multiprocessing as multi
import sys

import numpy as np
def chunks(n, page_list):
    """Split *page_list* into *n* roughly equal parts.

    Thin wrapper around numpy.array_split; returns a list of numpy
    arrays (some may be empty when n exceeds len(page_list)).
    """
    split_result = np.array_split(page_list, n)
    return split_result
def perform_extraction(page_ranges):
    """Extract data from the given pages, preprocess it, and write it out.

    Runs inside a worker process. The output file is named after the
    worker process so concurrent workers never clobber each other.
    NOTE(review): the extraction body is a stub in the original —
    requests/BeautifulSoup work still to be filled in.
    """
    # do requests and BeautifulSoup
    # preprocess the data
    file_name = multi.current_process().name + '.txt'
    # write into current process file


# The main guard is required for multiprocessing: on spawn-start
# platforms (Windows/macOS) each child re-imports this module, and an
# unguarded spawn loop would fork workers recursively.
if __name__ == '__main__':
    cpus = multi.cpu_count()
    workers = []
    # Fixed: the original was missing the comma after page2.html, so
    # implicit string concatenation silently merged pages 2 and 3 into
    # one bogus URL.
    page_list = ['www.website.com/page1.html', 'www.website.com/page2.html',
                 'www.website.com/page3.html', 'www.website.com/page4.html']
    # One bin of pages per CPU.
    page_bins = chunks(cpus, page_list)
    for cpu in range(cpus):
        sys.stdout.write("CPU " + str(cpu) + "\n")
        # Process that will send corresponding list of pages
        # to the function perform_extraction
        worker = multi.Process(name=str(cpu),
                               target=perform_extraction,
                               args=(page_bins[cpu],))
        worker.start()
        workers.append(worker)
    # Wait for every worker to finish before exiting.
    for worker in workers:
        worker.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi, how can I use this code to scrape data from energyhelpline.com?