Created
March 20, 2020 15:07
-
-
Save cderinbogaz/b73370dc98cddf099a16f22b815e65e5 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup
import urllib3
import ssl
import pandas
from multiprocessing import Process
import tldextract

# Many of the domains we scrape have invalid or self-signed certificates;
# swap in an unverified HTTPS context so fetches don't fail on TLS errors.
ssl._create_default_https_context = ssl._create_unverified_context

# Load the Majestic Million ranking; each row holds a domain whose meta
# description we will try to scrape as training data.
df = pandas.read_csv('./data/majestic_million.csv')
# We will fetch and try to get the meta | |
def fetch_meta(url):
    """Fetch *url* and build a training line from its meta description.

    Returns a string of the form ``'<content> = @ = <domain>.<suffix>\\n'``
    when the page has a meta description and a priority TLD, otherwise
    ``None`` (no description, low-priority TLD, or any request/parse error).
    """
    # The original code referenced undefined names `req` and `headers`;
    # construct them here. Certificate checks are disabled to match the
    # module-level unverified SSL context.
    http = urllib3.PoolManager(cert_reqs='CERT_NONE')
    request_headers = {'User-Agent': 'Mozilla/5.0'}
    try:
        res = http.request('GET', str(url), headers=request_headers, timeout=1)
        soup = BeautifulSoup(res.data, 'html.parser')
        # Sites capitalise the meta tag name inconsistently; try both.
        description = soup.find(attrs={'name': 'Description'})
        if description is None:
            description = soup.find(attrs={'name': 'description'})
        if description is None:
            print('Context is empty, pass')
            return None
        content = description['content']
        url_clean = tldextract.extract(url)
        suffix = url_clean.suffix
        domain = url_clean.domain
        # Skip RU/JP/CN/PL-style TLDs: we only want (mostly) English
        # training data, so keep a whitelist of common suffixes.
        if suffix in ['com', 'org', 'ai', 'me', 'app', 'io', 'ly', 'co']:
            print(url)
            print(url_clean)
            print(content)
            return str(content) + ' = @ = ' + str(domain) + '.' + str(suffix) + '\n'
        # Domains with unusual TLDs are low priority for the dataset.
        print('Domain suffix is low priority ' + str(url))
        return None
    except Exception as e:
        # Best-effort scraper: log the failure and skip this URL.
        print(e)
        return None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment