amrakm’s gists

amrakm / create_rd37_env.sh

Last active September 21, 2022 09:13

create conda env from shell script

	# Create a file `create_rd37_env.sh` and add this snippet to it.
	# Find your conda path with `which conda` and adjust CONDA_BIN if it differs.
	CONDA_BIN=/home/ec2-user/anaconda3/condabin/conda

	# `conda init` only edits the shell startup file, so it affects future
	# interactive shells, not the shell that is running this script.
	"$CONDA_BIN" init bash
	# Load conda's shell functions into the current shell as well; without this,
	# `conda activate` below fails when the script is executed with `bash`.
	eval "$("$CONDA_BIN" shell.bash hook)"

	conda create -y --name rd37 python=3.7
	conda activate rd37
	# Register the new environment as a Jupyter kernel named "rd37".
	conda install -y ipykernel
	python -m ipykernel install --user --name rd37 --display-name "rd37"

	# GPU libraries, installed into the activated rd37 environment.
	conda install -y -c conda-forge cudatoolkit=11.2 cudnn=8.1.0

amrakm / torch_data_loader_w_corrupted_imgs.md

Created September 7, 2022 20:50

torch_data_loader_w_corrupted_imgs.md

Follow these steps in order to handle corrupted images:

Return `None` from `__getitem__()` if the image is corrupted:

def __getitem__(self, idx):
    try:
 img, label = load_img(idx)

amrakm / cluster_images.py

Last active September 2, 2022 16:10

cluster images using CLIP embeddings

	# https://github.com/MaartenGr/Concept
	# !pip install concept umap-learn matplotlib

	import glob
	import hdbscan
	import umap
	from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
	from concept import ConceptModel

	imgs_folder_path = './imgs'

amrakm / load_json_from_s3.py

Created August 24, 2022 15:13

load json from s3

	import boto3
	import json

	s3_bucket = 'production-db'
	s3 = boto3.client('s3')

	def get_json_from_s3(key: str, bucket=s3_bucket):
	"""
	Retrieves the json file containing responses from s3. returns a dict

amrakm / download_url_parallel.py

Created July 1, 2022 13:16

download url parallel

	import eventlet,sys
	# note: this urllib import doesn't work in Python2
	from eventlet.green.urllib.request import urlopen

	# First command-line argument: path to a text file with one URL per line.
	file = sys.argv[1]
	with open(file, 'r') as f:
	    # Strip trailing whitespace (including the newline) from every line.
	    urls = [line.rstrip() for line in f]
	# Repeat the list 8 times, i.e. the same result as doubling it three times.
	urls = urls * 8

amrakm / restart_crashing_python_script.py

Last active June 20, 2022 13:58

restart python script if it crashes

	# https://stackoverflow.com/a/63021289/5554394

	from subprocess import run
	from time import sleep

	# Path and name to the script you are trying to start
	file_path = "test.py"

	restart_timer = 2
	def start_script():

amrakm / filter_nouns_only.py

Last active May 26, 2022 11:19

filter nouns only

	import nltk
	import nltk
	nltk.download('averaged_perceptron_tagger')
	lines = 'lines is some string of words'

	def filter_nouns_only(text):
	    """Return the words of ``text`` tagged as nouns, joined by single spaces.

	    The text is split with ``nltk.word_tokenize`` and tagged with
	    ``nltk.pos_tag``; a word is kept when its tag begins with ``NN``.
	    """
	    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
	    kept = []
	    for token, tag in tagged_tokens:
	        if tag.startswith('NN'):
	            kept.append(token)
	    return " ".join(kept)

amrakm / absa.py

Last active May 26, 2022 10:48

aspect based sentiment analysis

	# pip install git+https://github.com/ScalaConsultants/Aspect-Based-Sentiment-Analysis

	import aspect_based_sentiment_analysis as absa

	# Build the callable used below through the library's `load()` helper.
	nlp = absa.load()
	# Example input: one sentence written as two adjacent string literals.
	text = ("We are great fans of Slack, but we wish the subscriptions "
	"were more accessible to small startups.")

	# Call it on the text with two aspects and unpack the result into two values
	# (presumably one result per aspect, in the order given -- confirm in the library docs).
	slack, price = nlp(text, aspects=['slack', 'price'])
	# Print the first result's `sentiment` attribute and that attribute's `.value`.
	print(slack.sentiment, slack.sentiment.value)

amrakm / extract_sentence_that_contain_keyword.py

Created May 26, 2022 10:16

find sentences that contain a keyword - stemmed string match

	from nltk.stem.porter import PorterStemmer
	import re


	def extract_sentence_that_contain_keyword(keyword, text):

	stemmer = PorterStemmer()

	stemmed_keyword = stemmer.stem(keyword)
	stemmed_text = ' '.join([stemmer.stem(x) for x in text.split()])

amrakm / clean_html.py

Created April 29, 2022 11:34

clean text from html tags

	import re

	def cleanhtml(raw_html):

	# Some HTML texts can also contain entities that are not enclosed in brackets, such as '&nbsp;'. If that is the case, you might want to write the regex as follows:
	CLEANR = re.compile('<.*?>\|&([a-z0-9]+\|#[0-9]{1,6}\|#x[0-9a-f]{1,6});')

	# as per recommendation from @freylis, compile once only
	CLEANR = re.compile('<.*?>')
	cleantext = re.sub(CLEANR, '', raw_html)

Amr Mashlah amrakm