This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import glob | |
import json | |
from tqdm import tqdm | |
def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span removed.

    Args:
        raw_html: Text that may contain HTML/XML-style tags.

    Returns:
        The input string with each ``<`` up to the next ``>`` deleted.

    Note:
        This is a plain regex strip, not an HTML parser: any ``<`` ... ``>``
        span is removed, whether or not it is a real tag.
    """
    # ``[^>]`` also matches newlines, so tags that wrap across lines are
    # removed too (``.`` would stop at a line break without re.DOTALL).
    return re.sub(r'<[^>]*>', '', raw_html)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import itertools | |
from gensim.models import Word2Vec | |
from sklearn.manifold import TSNE | |
import matplotlib.pyplot as plt | |
import matplotlib.font_manager as fm | |
def tsne_plot(model, vocab): | |
"Creates and TSNE model and plots it" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
def get_meta_from_json(json_file):
    """Read ``json_file`` with ``pandas.read_json`` and return it transposed.

    Args:
        json_file: Anything ``pandas.read_json`` accepts (for example a path
            or a file-like object).

    Returns:
        The transposed ``DataFrame``: rows and columns of the frame produced
        by ``read_json`` are swapped.
    """
    return pd.read_json(json_file).T
if __name__=="__main__": | |
json_file = "myjson.json" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Dependencies: | |
$sudo apt-get install libfreetype6-dev libharfbuzz-dev libfribidi-dev gtk-doc-tools | |
$git clone https://github.com/python-pillow/Pillow.git | |
$cd Pillow/depends | |
$chmod +x install_raqm.sh | |
$./install_raqm.sh | |
$pip install pillow | |
""" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Collapse every run of consecutive newlines in the sample text into one.
text = "I live in Bangladesh.\n\n\nBangladesh is a beautiful country.\n\nI love my country."
res = re.sub(r'\n+', '\n', text)
print(res)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from tqdm import tqdm | |
count = 0 | |
for root, dirs, files in tqdm(os.walk("/path")): | |
for file in files: | |
if file.endswith(".txt"): | |
# print(file) | |
filename = "files/text_{}".format(count) | |
output = open(filename, "w") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Removing HTML Tag from text using regex | |
# code ref: https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string | |
import re | |
def cleanhtml(raw_html):
    """Strip every ``<...>`` span from ``raw_html`` and return the rest.

    Args:
        raw_html: Text that may contain HTML/XML-style tags.

    Returns:
        The input string with each ``<`` up to the next ``>`` deleted.

    Note:
        This is a regex-based strip rather than an HTML parser, so any
        ``<`` ... ``>`` span is removed even if it is not a real tag.
    """
    # The negated class matches line breaks as well, so a tag that is split
    # over several lines is still removed.
    return re.sub(r'<[^>]*>', '', raw_html)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
# Extract every maximal run of digits from the sample text.
text = "Hello123 with 563"
result = re.findall(r"\d+", text)
print(result)
# output: ['123', '563']
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import sys


def find_whole_word(word, subject):
    """Search ``subject`` for ``word`` as a whole word, ignoring case.

    Args:
        word: Literal text to look for; it is passed through ``re.escape``
            so regex metacharacters in it are matched literally.
        subject: Text to search in.

    Returns:
        The ``re.Match`` for the first occurrence, or ``None`` if there is
        none. Because of the look-arounds, ``word`` only matches when it
        begins with a word character and is not followed by one.
    """
    my_regex = r"\b(?=\w)" + re.escape(word) + r"\b(?!\w)"
    return re.search(my_regex, subject, re.IGNORECASE)


if __name__ == "__main__":
    # The word and the text to search are both taken from the command line.
    if len(sys.argv) < 3:
        sys.exit("usage: python {} WORD TEXT".format(sys.argv[0]))
    TEXTO = sys.argv[1]
    subject = sys.argv[2]
    result = find_whole_word(TEXTO, subject)
    print(result)
# ref | |
# https://stackoverflow.com/questions/6930982/how-to-use-a-variable-inside-a-regular-expression |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
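# Dependencies
# python 3
# pip install fpdf
# to run: python <script_name>.py
# NOTE: do not save this script as fpdf.py -- that name would shadow the
# installed fpdf package and break the "from fpdf import FPDF" import.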
from fpdf import FPDF | |
from PIL import Image | |
import os | |
listPages = os.listdir("images") |