Created
March 20, 2020 15:07
-
-
Save cderinbogaz/b73370dc98cddf099a16f22b815e65e5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup
import urllib3
import ssl
import pandas
from multiprocessing import Process
import tldextract

# Many of the domains we scrape have invalid or self-signed certificates;
# swap in an unverified HTTPS context so fetches don't fail on TLS errors.
ssl._create_default_https_context = ssl._create_unverified_context

# Load the Majestic Million ranking; each row holds a domain whose meta
# description we will try to scrape as training data.
df = pandas.read_csv('./data/majestic_million.csv')
# We will fetch and try to get the meta | |
def fetch_meta(url):
    """Fetch *url* and build a training line from its meta description.

    Returns a string of the form ``'<content> = @ = <domain>.<suffix>\\n'``
    when the page has a meta description and a priority TLD, otherwise
    ``None`` (no description, low-priority TLD, or any request/parse error).
    """
    # The original code referenced undefined names `req` and `headers`;
    # construct them here. Certificate checks are disabled to match the
    # module-level unverified SSL context.
    http = urllib3.PoolManager(cert_reqs='CERT_NONE')
    request_headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        res = http.request('GET', str(url), headers=request_headers, timeout=1)
        soup = BeautifulSoup(res.data, 'html.parser')
        # Sites capitalise the meta tag name inconsistently; try both.
        description = soup.find(attrs={'name': 'Description'})
        if description is None:
            description = soup.find(attrs={'name': 'description'})
        if description is None:
            print('Context is empty, pass')
            return None
        content = description['content']
        url_clean = tldextract.extract(url)
        suffix = url_clean.suffix
        domain = url_clean.domain
        # Skip RU/JP/CN/PL-style TLDs: we only want (mostly) English
        # training data, so keep a whitelist of common suffixes.
        if suffix in ['com', 'org', 'ai', 'me', 'app', 'io', 'ly', 'co']:
            print(url)
            print(url_clean)
            print(content)
            return str(content) + ' = @ = ' + str(domain) + '.' + str(suffix) + '\n'
        # Domains with unusual TLDs are low priority for the dataset.
        print('Domain suffix is low priority ' + str(url))
        return None
    except Exception as e:
        # Best-effort scraper: log the failure and skip this URL.
        print(e)
        return None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment