Oliver Batey oliver-batey

oliver-batey / sentence_sentiment.py

Last active November 28, 2021 19:59

Mean sentence sentiment

	import matplotlib.pyplot as plt
	import numpy as np
	import pandas as pd
	from textblob import TextBlob


	def sentiment_polarity(string: str) -> float:
	    """Return the TextBlob sentiment polarity of ``string``.

	    The polarity is the first field of the ``sentiment`` value that
	    TextBlob computes for the text.
	    """
	    return TextBlob(string).sentiment.polarity

oliver-batey / bad_file_parsing.py

Last active January 19, 2021 09:42

Example bad file parsing function

	def serialise_file(document, format):
	    """Read a document from disk and return its text as one string.

	    Args:
	        document: Path of the file to read.
	        format: File type without a leading dot, e.g. ``'txt'`` or ``'docx'``.

	    Returns:
	        For ``'txt'``, the file contents with every newline replaced by a
	        space. ``None`` when no branch recognises ``format``.

	    Raises:
	        NotImplementedError: For ``'docx'``, whose parsing is still a
	            placeholder.
	    """
	    if format == 'txt':
	        with open(document, 'r') as file:
	            # Flatten the text onto a single line.
	            return file.read().replace('\n', ' ')
	    elif format == 'docx':
	        # docx parsing code here
	        # Fail loudly rather than silently returning None for a known format.
	        raise NotImplementedError('docx parsing is not implemented yet')
	    return None

oliver-batey / factory_pattern_part1.py

Last active January 20, 2021 20:27

Part 1 of the factory method pattern

	import os
	from docx import Document

	class DocParser:
	    """Parses a document by delegating to a parser chosen by ``get_format``."""

	    def parse(self, document):
	        """Return the result of calling the selected parser on ``document``.

	        ``get_format(document)`` supplies a callable, which is then invoked
	        with the same ``document`` argument.
	        """
	        return get_format(document)(document)

	def get_format(document):
	format = os.path.splitext(document)[-1]

oliver-batey / common_interface.py

Last active January 21, 2021 15:58

Common interface for parsing txt, docx, pdf, html and pptx

	import os
	import io
	from docx import Document

	from pdfminer3.layout import LAParams, LTTextBox
	from pdfminer3.pdfpage import PDFPage
	from pdfminer3.pdfinterp import PDFResourceManager
	from pdfminer3.pdfinterp import PDFPageInterpreter
	from pdfminer3.converter import PDFPageAggregator
	from pdfminer3.converter import TextConverter

oliver-batey / using_common_interface.py

Last active January 20, 2021 23:27

How to use the common interface to parse different file types

	import parse_file as dp

	# Paths to the test files, one per file type.
	file_paths = [
	    'test_txt.txt',
	    'test_docx.docx',
	    'test_pdf.pdf',
	    'test_html.html',
	    'test_pptx.pptx',
	]
	# Individual names for each path, in the same order as the list above.
	txt_path, docx_path, pdf_path, html_path, pptx_path = file_paths