Skip to content

Instantly share code, notes, and snippets.

@cderinbogaz
Created March 20, 2020 15:07
Show Gist options
  • Save cderinbogaz/b73370dc98cddf099a16f22b815e65e5 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import urllib3
import ssl
import pandas
from multiprocessing import Process
import tldextract
# Globally disable HTTPS certificate verification so that sites with
# broken/self-signed certs can still be scraped.
# NOTE(review): this weakens TLS for the whole process — acceptable for a
# one-off scraping script, never for production code.
ssl._create_default_https_context = ssl._create_unverified_context
# Load the Majestic Million ranked-domain list into a pandas DataFrame.
# Expected to contain the domain/URL column the fetcher iterates over —
# assumes the file exists relative to the working directory; TODO confirm.
df = pandas.read_csv('./data/majestic_million.csv')
# We will fetch and try to get the meta
# We will fetch and try to get the meta
def fetch_meta(url):
    """Fetch *url* and return a training-data line built from its meta description.

    Returns a string of the form ``"<description> = @ = <domain>.<suffix>\\n"``
    when the page has a meta-description tag and its TLD is in the accepted
    (mostly English-language) set; returns ``None`` when the description is
    missing, the TLD is low priority, or the request/parse fails.

    Parameters:
        url: the address to fetch (converted to ``str`` before the request).
    """
    # BUGFIX: the original referenced undefined names `req` and `headers`,
    # which raised NameError on every call. Define both here.
    headers = {'User-Agent': 'Mozilla/5.0 (compatible; meta-fetcher)'}
    http = urllib3.PoolManager()
    try:
        res = http.request('GET', str(url), headers=headers, timeout=1)
        soup = BeautifulSoup(res.data, 'html.parser')
        # Meta-tag name casing varies across sites; try capitalized first,
        # then lowercase.
        description = soup.find(attrs={'name': 'Description'})
        if description is None:
            description = soup.find(attrs={'name': 'description'})
        if description is None:
            print('Context is empty, pass')
            return None
        content = description['content']
        url_clean = tldextract.extract(url)
        suffix = url_clean.suffix
        domain = url_clean.domain
        # Skip domains with RU, JP, CN, PL etc. TLDs — we only want English
        # training data, so keep a small whitelist of common suffixes.
        if suffix in ('com', 'org', 'ai', 'me', 'app', 'io', 'ly', 'co'):
            print(url)
            print(url_clean)
            print(content)
            return str(content) + ' = @ = ' + str(domain) + '.' + str(suffix) + '\n'
        # Domains with unusual TLDs are low priority; keep the data clean.
        print('Domain suffix is low priority ' + str(url))
        return None
    except Exception as e:
        # Network timeouts and parse failures are expected at scale when
        # crawling a million domains: log and skip (explicit None return
        # matches the original's implicit fall-through behavior).
        print(e)
        return None
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment