Created
March 3, 2022 16:39
-
-
Save Bishwas-py/067055ce8c0d770956a554bf9d63ce50 to your computer and use it in GitHub Desktop.
webscraping and multiprocessing with py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_set_data(link, mp_data):
    """Gather statistics for one article and publish them to ``mp_data``.

    Builds a record with the article's url, summary, content, keywords and
    tags, adds word/paragraph statistics and an ``'hq'`` flag, appends the
    record to ``mp_data['articles']`` and increments
    ``mp_data['site_info']['pages_num']``. For high-quality articles
    ``mp_data['site_info']['hq_pages_num']`` is incremented as well.

    Args:
        link: Location of the article, handed to ``Article``.
        mp_data: Mapping with an append-able ``'articles'`` entry and a
            ``'site_info'`` mapping whose ``'pages_num'`` and
            ``'hq_pages_num'`` entries expose a mutable ``.value``.

    Returns:
        None. Results are communicated only through ``mp_data``.

    An ``ArticleException`` raised while processing is swallowed and the
    link is skipped without touching ``mp_data['articles']`` or
    ``pages_num``.
    """
    try:
        # Fix: the object used to be bound to ``articles`` while every later
        # line read ``article`` (NameError), and ``link`` was never used.
        # NOTE(review): assumes ``Article`` accepts the link as its first
        # argument and that any download/parse step it needs was elided
        # from this snippet -- confirm against the real Article class.
        article = Article(link)  # gets article information
        article_data = {
            'url': article.url,
            'summary': article.summary,
            'content': article.text,
            'keywords': article.keywords,
            'tags': article.tags,
        }

        article_words = article_data['content'].split(' ')
        article_paragraphs = article_data['content'].splitlines()
        word_count = len(article_words)
        paragraph_count = len(article_paragraphs)

        # Empty text has no lines; avoid dividing by zero.
        if paragraph_count:
            word_paragraph_ratio = word_count / paragraph_count
        else:
            word_paragraph_ratio = 0

        article_data['word_paragraph_ratio'] = word_paragraph_ratio
        article_data['paragraph_count'] = paragraph_count
        article_data['word_count'] = word_count

        # Article High Quality Detection: at least 250 words and at least
        # 6 words per paragraph. The flag is always written so that every
        # record carries an 'hq' key (previously one branch left it unset).
        is_hq = word_count >= 250 and word_paragraph_ratio >= 6
        article_data['hq'] = is_hq

        site_info = mp_data['site_info']
        # NOTE(review): ``.value += 1`` is a separate read and write; when
        # several workers share these counters an increment can be lost.
        # Guard with a shared lock if exact totals matter.
        if is_hq:
            site_info['hq_pages_num'].value += 1
        mp_data['articles'].append(article_data)
        site_info['pages_num'].value += 1
    except ArticleException:
        # Deliberate best effort: an article that cannot be processed is
        # skipped rather than aborting the worker.
        pass
More...
# Shared state handed to every worker: two integer counters under
# 'site_info' plus the list that collects one record per article.
mp_data = manager.dict(
    {
        'site_info': {
            'pages_num': manager.Value('i', 0),
            'hq_pages_num': manager.Value('i', 0),
        },
        'articles': manager.list(),
    }
)
# Start one worker process per distinct link, then wait for all of them.
processes = []
added_articles = []  # links already dispatched, in first-seen order
# Set mirror of added_articles so the duplicate check is O(1) instead of
# scanning the list for every link.
# NOTE(review): assumes links are hashable (e.g. URL strings) -- confirm.
dispatched_links = set()
for link in links:
    if link in dispatched_links:
        continue
    process = Process(target=get_set_data, args=(link, mp_data))
    processes.append(process)
    process.start()
    added_articles.append(link)
    dispatched_links.add(link)

# Join only after every worker has been started so they run concurrently.
for process in processes:
    print("Before Joining: ", process.is_alive())
    process.join()
    print("After Joining: ", process.is_alive())
# Copy the shared results into plain Python objects: read each counter's
# current value and materialise the article list.
_site_counters = mp_data['site_info']
data = {
    'site_info': {
        counter_name: _site_counters[counter_name].value
        for counter_name in ('pages_num', 'hq_pages_num')
    },
    'articles': list(mp_data['articles']),
}
# Wrap the single record in a list before concatenating: adding it to the
# list directly raises TypeError instead of appending one element.
mp_data['articles'] = mp_data['articles'] + [article_data]
from multiprocessing import Process, Manager
def f(d):
    """Extend the list stored under key ``1`` of ``d`` and print ``d``.

    Each item is added by building a new list and assigning it back to
    ``d[1]``; the list object previously stored there is never mutated.
    """
    for extra in (4, 5, 'fjasfkjs'):
        d[1] = d[1] + [extra]
    print(d)
# Entry-point guard. The original ``if name == 'main':`` raised NameError
# (the double underscores were lost); it must compare ``__name__`` with
# ``'__main__'`` so that merely importing this module starts no process.
if __name__ == '__main__':
    manager = Manager()  # create only one manager
    d = manager.dict()  # create only one shared dict
    d[1] = []
    p = Process(target=f, args=(d,))  # tell 'f' which 'd' it should append to
    p.start()
    p.join()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Here's mp_data: