Sagor Sarker sagorbrur

🎯

Focusing

An enthusiastic NLP/AI/ML practitioner.

sagorbrur / xml_to_json.py

Last active June 27, 2019 07:17

	"""
	----------------------------------------------
	| Convert XML file to JSON file Using Python |
	| Writer: Sagor Sarker |
	| Date : 27 June 2019 |
	----------------------------------------------

	"""

sagorbrur / merge_json.py

Created June 29, 2019 11:36

	"""
	Name: Merging multiple json file into one json file
	Date: 29/06/2019
	Writer: Sagor Sarker
	"""

	import glob
	import json

sagorbrur / remove_english_word.py

Created September 26, 2019 07:17

	# Remove English words from a non-English document

	import re

	# Sample document: Bengali text with a few embedded English words and one
	# fully English line.
	doc = """
	নতুন করে কিছু english শব্দ নিয়ে সমস্যা তৈরি হয়েছে। বাক্যের মধ্যে এই English শব্দ খুবেই বিরক্তির উদ্রেক করছে।
	Just figure out to remove these words
	"""
	# Delete every run of ASCII letters. Only the letters are removed: the
	# whitespace and punctuation that surrounded each word stay in place.
	result = re.sub(r'[A-Za-z]+', '', doc)

sagorbrur / sort_glob_list.py

Created September 26, 2019 07:46

	# glob returns paths in arbitrary order, and a plain sort is lexicographic
	# ("10.txt" sorts before "2.txt"). This script sorts the paths in natural
	# order, where digit runs are compared by numeric value.

	import glob
	import re


	def natural_key(path):
	    """Return a sort key that orders digit runs in *path* numerically.

	    The path is split into single non-digit characters and runs of ASCII
	    digits. Each token is tagged so that numbers and text never get compared
	    with each other directly (which would raise TypeError): numeric tokens
	    sort before text tokens found at the same position.
	    """
	    return [
	        # isdecimal(), not isdigit(): int() rejects digit-like characters
	        # such as superscripts, which isdigit() accepts.
	        (0, int(token)) if token.isdecimal() else (1, token)
	        for token in re.findall(r'[^0-9]|[0-9]+', path)
	    ]


	files = glob.glob('test/*.txt')

	files.sort(key=natural_key)

	for file in files:

sagorbrur / image_2_pdf.py

Created October 20, 2019 09:20

sagorbrur / variable_regex.py

Created October 31, 2019 08:47

	import re
	import sys

	# Text to search for, taken literally from the first command-line argument.
	TEXTO = sys.argv[1]
	# re.escape() neutralises any regex metacharacters in TEXTO. The surrounding
	# anchors require the match to begin at the start of a word and to finish at
	# the end of one, so TEXTO is only found as a whole word.
	my_regex = r"\b(?=\w)" + re.escape(TEXTO) + r"\b(?!\w)"

	# NOTE(review): `subject` (the text being searched) is not defined anywhere
	# in this snippet; it must be assigned before this line runs.
	result = re.search(my_regex, subject, re.IGNORECASE)
	print(result)

	# ref
	# https://stackoverflow.com/questions/6930982/how-to-use-a-variable-inside-a-regular-expression

sagorbrur / digit_separate_regex.py

Created November 19, 2019 18:05

sagorbrur / remove_html_tag.py

Created November 30, 2019 09:01

	# Removing HTML Tag from text using regex

	# code ref: https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string

	import re

	# Matches "<", the shortest possible run of characters, then ">".
	# DOTALL lets "." cross line breaks so tags written over several lines
	# are matched as well.
	_TAG_RE = re.compile(r'<.*?>', re.DOTALL)


	def cleanhtml(raw_html):
	    """Return *raw_html* with every ``<...>`` span removed.

	    This is a plain regex substitution, not an HTML parser: any text between
	    a ``<`` and the next ``>`` is treated as a tag and deleted. Text outside
	    such spans is returned unchanged.
	    """
	    return _TAG_RE.sub('', raw_html)

sagorbrur / select_all_using_ext.py

Created December 3, 2019 03:56

	import os
	from tqdm import tqdm
	count = 0

	for root, dirs, files in tqdm(os.walk("/path")):
	for file in files:
	if file.endswith(".txt"):
	# print(file)
	filename = "files/text_{}".format(count)
	output = open(filename, "w")

sagorbrur / replace_multiple_newline_with_single_newline.py

Created December 23, 2019 07:38

	import re

	# Sample text containing runs of three and two consecutive newlines.
	text = "I live in Bangladesh.\n\n\nBangladesh is a beautiful country.\n\nI love my country."
	# Collapse each run of one or more newlines into a single newline.
	newline_runs = re.compile(r'\n+')
	res = newline_runs.sub('\n', text)
	print(res)