Octoparse octoparse

Octoparse is free, multi-award-winning web scraping software that turns websites into structured data without coding. #WebScraping #DataCollection

octoparse / nfl1.py

Created November 11, 2019 04:09

Scraping fantasy football projections

	from bs4 import BeautifulSoup

	import re
	import requests


	def get_html_data(url, timeout=30):
	    """Download *url* and return its body parsed as a BeautifulSoup tree.

	    Args:
	        url: Address of the page to fetch.
	        timeout: Seconds to wait for the server before giving up. Passed
	            straight to ``requests.get``; the default keeps existing
	            one-argument callers working.

	    Returns:
	        A ``BeautifulSoup`` object built from the raw response bytes using
	        the "html5lib" parser.

	    Raises:
	        requests.exceptions.Timeout: If the server does not answer within
	            ``timeout`` seconds.
	    """
	    # requests waits indefinitely when no timeout is given, so a stalled
	    # server would otherwise hang the scraper forever.
	    response = requests.get(url, timeout=timeout)
	    # The raw bytes (response.content) are handed to the parser rather
	    # than decoded text.
	    return BeautifulSoup(response.content, "html5lib")

octoparse / gender_analysis_on_movies.py

Last active May 9, 2019 01:24

Data Science: What is the near future of Superheroines?

	import collections
	import re


	def get_first_name(aString):
	if not aString:
	return aString
	ss = aString.replace('', '').split(' ') # ['Leonard', 'NimoyChris', 'PineZachary', 'QuintoZoe', 'SaldanaKarl']
	name_list = [] # result returned for this function
	for name in ss:

octoparse / c5a44b9288ab-code.py

Created April 16, 2019 02:51

	import re
	import json

	# save the positive words into a list called p_list
	with open('positive.txt') as f:
	    p_txt = f.read()
	# Strip punctuation, digits, possessive 's and stray apostrophes.
	# The alternation bars must be unescaped: in Python's re syntax "\|" is a
	# literal pipe character, so the escaped form only matched text that
	# actually contained pipes and left ordinary punctuation in place.
	p_txt = re.sub(r'[,\.()":;!@#$%^&*\d]|\'s|\'', '', p_txt)
	# split() with no argument breaks on any run of whitespace (spaces and
	# newlines alike), so no empty-string "words" end up in the list.
	p_list = p_txt.lower().split()
	# test if cool is in the list
	# Single-argument print(...) runs on both Python 2 and 3; the two spaces
	# before %s reproduce the separator the old print statement emitted.
	print('cool is in the postive list:  %s' % ('cool' in p_list))

octoparse / c5a44b9288ab-part5.py

Created April 16, 2019 02:43

	# Tally `word` into the positive or the negative frequency table.
	# NOTE(review): indentation was lost in this snippet; `word`, p_list,
	# n_list and both count dicts are not defined here - presumably this sits
	# inside a loop over the words of a text. Confirm against the full file.
	# count if it is a positive word
	if word in p_list:
	if word in word_count_positive.keys():
	word_count_positive[word] += 1
	else:
	word_count_positive[word] = 1
	# else see if it is a negative word
	elif word in n_list:
	if word in word_count_negative.keys():
	word_count_negative[word] += 1
	# NOTE(review): no branch is visible that initialises
	# word_count_negative[word] for a first-seen negative word; the snippet
	# appears to be truncated here - confirm.

octoparse / c5a44b9288ab-part4.py

Created April 16, 2019 02:43

	for word in word_list:
	    # count all words frequency
	    # dict.get supplies 0 for a word not seen before. It replaces the
	    # "word in word_count_dict.keys()" test, which on Python 2 builds a
	    # list of every key and scans it for each word.
	    # NOTE(review): assumes word_count_dict is a plain dict - confirm.
	    word_count_dict[word] = word_count_dict.get(word, 0) + 1

octoparse / c5a44b9288ab-part3.py

Created April 16, 2019 02:42

octoparse / c5a44b9288ab-part2.py

Created April 16, 2019 02:42

	txt = f.read()
	# Strip punctuation, digits, possessive 's and stray apostrophes.
	# The alternation bars must be unescaped: in Python's re syntax "\|" is a
	# literal pipe character, so the escaped form only matched text that
	# actually contained pipes and left ordinary punctuation in place.
	txt = re.sub(r'[,\.()":;!@#$%^&*\d]|\'s|\'', '', txt)
	# split() with no argument breaks on any run of whitespace (spaces and
	# newlines alike), so no empty-string "words" end up in the list.
	word_list = txt.lower().split()

octoparse / c5a44b9288ab-part1.py

Created April 16, 2019 02:41

	import re
	import json
	# Read the positive-word file and turn it into a list of lowercase words.
	with open('positive.txt') as f:
	    p_txt = f.read()
	# Strip punctuation, digits, possessive 's and stray apostrophes.
	# The alternation bars must be unescaped: in Python's re syntax "\|" is a
	# literal pipe character, so the escaped form only matched text that
	# actually contained pipes and left ordinary punctuation in place.
	p_txt = re.sub(r'[,\.()":;!@#$%^&*\d]|\'s|\'', '', p_txt)
	# split() with no argument breaks on any run of whitespace (spaces and
	# newlines alike), so no empty-string "words" end up in the list.
	p_list = p_txt.lower().split()
	# test if cool is in the list
	# Single-argument print(...) runs on both Python 2 and 3; the two spaces
	# before %s reproduce the separator the old print statement emitted.
	print('cool is in the postive list:  %s' % ('cool' in p_list))
	with open('negative.txt') as f:
	n_txt = f.read()