Skip to content

Instantly share code, notes, and snippets.

@partrita
Created September 21, 2024 00:16
Show Gist options
  • Save partrita/c0f713018fd2e652cec453c92ed19621 to your computer and use it in GitHub Desktop.
Save partrita/c0f713018fd2e652cec453c92ed19621 to your computer and use it in GitHub Desktop.
Python scripts
import re
import glob
import csv
from typing import List, Dict
from pathlib import Path
def make_file_list(extension: str) -> List[str]:
    """Return the files in the current working directory with a given extension.

    Args:
        extension: File extension without the leading dot (e.g. ``"csv"``).
            It is interpolated into a glob pattern, so glob metacharacters
            in it are interpreted by ``glob.glob``.

    Returns:
        The matching relative file names, in the order ``glob.glob`` yields them.
    """
    pattern = "*." + extension
    return glob.glob(pattern)
def parse_chemical_formula(formula: str) -> Dict[str, str]:
    """Count the atoms of each element in a flat chemical formula.

    An element symbol is one uppercase letter followed by any number of
    lowercase letters; an optional run of digits after it is its count
    (a missing count means 1). Symbols that occur more than once, as in
    ``"CH3COOH"``, are summed rather than letting the last occurrence win.
    Parentheses, charges and hydrate dots are not interpreted.

    Args:
        formula: Formula text such as ``"C6H12O6"``.

    Returns:
        Mapping of element symbol to its total count, as a decimal string.
        An input with no element symbols yields an empty dict.
    """
    totals: Dict[str, int] = {}
    # One pass with two capture groups replaces the per-segment re-scans.
    for symbol, digits in re.findall(r'([A-Z][a-z]*)([0-9]*)', formula):
        totals[symbol] = totals.get(symbol, 0) + (int(digits) if digits else 1)
    # Callers expect string counts (they are written straight into CSV cells).
    return {symbol: str(total) for symbol, total in totals.items()}
def process_csv_file(input_file: str, chemical_elements: List[str]):
    """Write a copy of a CSV file with one extra count column per element.

    The formula is read from the fourth column (index 3). A row whose fourth
    column is the literal ``Formula`` is treated as the header and gets the
    element symbols appended; every other row gets the per-element counts
    from ``parse_chemical_formula`` (``'0'`` for absent elements). Rows with
    fewer than four columns, including blank lines, are copied through
    unchanged instead of raising ``IndexError``.

    The result is written next to the input, with the input's final suffix
    replaced by ``.CHNOSPSi.csv``.

    Args:
        input_file: Path of the CSV file to read.
        chemical_elements: Element symbols to add as columns, in order.
    """
    formula_column = 3
    output_file = Path(input_file).with_suffix('.CHNOSPSi.csv')
    with open(input_file, 'r', newline='') as infile, open(output_file, 'w', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)
        for row in reader:
            if len(row) <= formula_column:
                # No formula cell to parse (blank or truncated row).
                writer.writerow(row)
            elif row[formula_column] == 'Formula':
                writer.writerow(row + chemical_elements)
            else:
                formula_dict = parse_chemical_formula(row[formula_column])
                writer.writerow(row + [formula_dict.get(element, '0') for element in chemical_elements])
def main():
    """Annotate every CSV file in the current directory with element counts.

    Files that already end in ``.CHNOSPSi.csv`` are skipped: they are outputs
    of an earlier run and match the ``*.csv`` glob, so without the check a
    second run would annotate its own output again.
    """
    chemical_elements = ['C', 'H', 'N', 'O', 'S', 'P', 'Si']
    output_suffix = '.CHNOSPSi.csv'
    for file in make_file_list('csv'):
        if file.endswith(output_suffix):
            continue
        process_csv_file(file, chemical_elements)
if __name__ == "__main__":
    # Run the CSV annotation only when executed as a script, not on import.
    main()
import urllib.request
import xml.etree.ElementTree as ET
import os
import gzip
import shutil
from typing import List, Optional
def fetch_pdb_list(sequence: str, e_cutoff: str, resolution: str) -> List[str]:
    """Query the RCSB search service for structures matching a sequence.

    Posts a composite XML query with three refinement levels: a BLAST
    sequence search with the given e-value cutoff, a resolution filter with
    ``resolution`` as the upper bound, and a title filter excluding titles
    that contain ``arp``. The arguments are interpolated into the XML as-is,
    without escaping.

    NOTE(review): this URL looks like the legacy RCSB REST search endpoint;
    confirm it is still served before relying on this function.

    Args:
        sequence: Sequence text placed in the ``<sequence>`` element.
        e_cutoff: Value placed in the ``<eCutOff>`` element.
        resolution: Value placed in the ``refine.ls_d_res_high.max`` element.

    Returns:
        The whitespace-separated tokens of the response body (empty list for
        an empty body).
    """
    endpoint = 'http://www.rcsb.org/pdb/rest/search'
    payload = f"""
<?xml version="1.0" encoding="UTF-8"?>
<orgPdbCompositeQuery version="1.0">
<queryRefinement>
<queryRefinementLevel>0</queryRefinementLevel>
<orgPdbQuery>
<queryType>org.pdb.query.simple.SequenceQuery</queryType>
<sequence>{sequence}</sequence>
<eCutOff>{e_cutoff}</eCutOff>
<searchTool>blast</searchTool>
</orgPdbQuery>
</queryRefinement>
<queryRefinement>
<queryRefinementLevel>1</queryRefinementLevel>
<orgPdbQuery>
<queryType>org.pdb.query.simple.ResolutionQuery</queryType>
<description>ResolutionQuery </description>
<refine.ls_d_res_high.comparator>between</refine.ls_d_res_high.comparator>
<refine.ls_d_res_high.max>{resolution}</refine.ls_d_res_high.max>
</orgPdbQuery>
</queryRefinement>
<queryRefinement>
<queryRefinementLevel>2</queryRefinementLevel>
<orgPdbQuery>
<queryType>org.pdb.query.simple.StructTitleQuery</queryType>
<struct.title.comparator>!contains</struct.title.comparator>
<struct.title.value>arp</struct.title.value>
</orgPdbQuery>
</queryRefinement>
</orgPdbCompositeQuery>
"""
    print("Querying PDB...\n")
    # Supplying ``data`` makes urllib send the query as a POST body.
    request = urllib.request.Request(endpoint, data=payload.encode())
    with urllib.request.urlopen(request) as response:
        body = response.read().decode()
    # str.split() on an empty body already yields [].
    return body.split()
def download_pdb_files(pdb_list: List[str], biological_unit: bool = True) -> None:
    """Download and decompress one coordinate file per PDB id.

    Each entry is fetched as a gzip archive into the current directory,
    decompressed to ``<id>.pdb`` and the archive is deleted. The archive is
    removed even if the download or decompression fails, so an aborted run
    does not leave stray ``.pdb.gz`` files behind; the error still propagates.

    Args:
        pdb_list: PDB ids; each is lower-cased to build the download URL.
        biological_unit: If true, fetch the ``.pdb1.gz`` biounit file from the
            wwPDB FTP server; otherwise fetch ``<id>.pdb.gz`` from rcsb.org.
    """
    if biological_unit:
        pdb_url = "ftp://ftp.wwpdb.org/pub/pdb/data/biounit/coordinates/all/{0}.pdb1.gz"
    else:
        pdb_url = "http://www.rcsb.org/pdb/files/{0}.pdb.gz"
    for pdb in pdb_list:
        print(f"Downloading {pdb}")
        archive = f"{pdb}.pdb.gz"
        try:
            urllib.request.urlretrieve(pdb_url.format(pdb.lower()), archive)
            with gzip.open(archive, 'rb') as f_in, open(f"{pdb}.pdb", 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        finally:
            # The archive may not exist if the download itself failed.
            if os.path.exists(archive):
                os.remove(archive)
def generate_pymol_script(pdb_list: List[str]) -> None:
    """Write ``load.pml``, a PyMOL script that loads and aligns the structures.

    The script loads ``<cwd>/<id>.pdb`` for every id, hides everything, shows
    the cartoon representation, and then emits one ``cealign`` command per
    remaining id with the first id as the first argument. The file is written
    to the current working directory.

    Args:
        pdb_list: PDB ids, in the order they should be loaded.
    """
    workdir = os.getcwd()
    commands = [f"load {workdir}/{pdb}.pdb\n" for pdb in pdb_list]
    commands += ["hide all\n", "show cartoon\n"]
    # pdb_list[0] is only evaluated when there is at least a second entry.
    commands += [f"cealign {pdb_list[0]}, {other}\n" for other in pdb_list[1:]]
    with open('load.pml', 'w') as script:
        script.writelines(commands)
def fetch_pdb_info(pdb_list: List[str]) -> None:
    """Fetch a custom report for the given ids and save it as ``list.txt``.

    Requests an XML report from the RCSB custom-report service and writes one
    tab-separated line per ``record`` element to ``list.txt`` in the current
    directory. A field without text is written as an empty cell, so every
    value stays in its own column; dropping such fields would shift the
    remaining values to the left.

    NOTE(review): this URL looks like the legacy RCSB REST report endpoint;
    confirm it is still served before relying on this function.

    Args:
        pdb_list: PDB ids, joined with commas into the ``pdbids`` parameter.
    """
    report_url = "http://www.rcsb.org/pdb/rest/customReport?pdbids={0}&customReportColumns=structureId,structureTitle,resolution,experimentalTechnique,depositionDate,structureAuthor,classification,structureMolecularWeight&service=wsdisplay&format=xml"
    req = urllib.request.Request(report_url.format(",".join(pdb_list)))
    with urllib.request.urlopen(req) as response:
        result = response.read().decode()
    if not result:
        print("Failed to retrieve PDB info")
        return
    root = ET.fromstring(result)
    with open('list.txt', 'w') as f:
        for record in root.iter('record'):
            # Element.text is None for empty elements; keep the column anyway.
            f.write("\t".join(field.text or "" for field in record) + "\n")
def main() -> None:
    """Search the PDB for a fixed sequence, then download and summarise hits.

    Uses a hard-coded sequence, an e-value cutoff of ``1e-30`` and a
    resolution bound of ``3``. When the search returns ids, the structures
    are downloaded, a PyMOL script is generated and a report is fetched;
    otherwise a message is printed.
    """
    # The sequence text keeps its embedded line breaks, exactly as passed on.
    sequence = (
        "\n"
        "MSMILKEIRMNNFKSHVNSRIKFEKGIVAIIGENGSGKSSIFEAVFFALFGAGSNFNYDT\n"
        "IITKGKKSVYVELDFEVNGNNYKIIREYDSGRGGAKLYKNGKPYATTISAVNKAVNEILG\n"
        "VDRNMFLNSIYIKQGEIAKFLSLKPSEKLETVAKLLG\n"
    )
    resolution = "3"
    e_cutoff = "1e-30"
    pdb_list = fetch_pdb_list(sequence, e_cutoff, resolution)
    if not pdb_list:
        print("No PDB structures found")
        return
    download_pdb_files(pdb_list)
    generate_pymol_script(pdb_list)
    fetch_pdb_info(pdb_list)
if __name__ == '__main__':
    # Run the PDB search/download workflow only when executed as a script.
    main()
import sys
import time
import argparse
from tqdm import tqdm
def parse_arguments() -> argparse.Namespace:
    """Parse the timer's command-line options from ``sys.argv``.

    Returns:
        Namespace with integer attributes ``time`` (default 25),
        ``break_time`` (default 5) and ``repeat`` (default 3).
    """
    parser = argparse.ArgumentParser(description='Set time of timer. It should be in minutes.')
    # (short flag, long flag, default, help text) for each integer option.
    options = (
        ('-t', '--time', 25, 'Set pomodoro interval in minutes.'),
        ('-b', '--break_time', 5, 'Set break interval in minutes.'),
        ('-r', '--repeat', 3, 'Set number of pomodoro rounds.'),
    )
    for short_flag, long_flag, default, help_text in options:
        parser.add_argument(short_flag, long_flag, type=int, default=default, help=help_text)
    return parser.parse_args()
def run_pomodoro(pomodoro_time: int, break_time: int, rounds: int) -> None:
    """Run ``rounds`` work intervals, with a break between consecutive rounds.

    Args:
        pomodoro_time: Length passed to ``countdown`` for each work interval.
        break_time: Length passed to ``countdown`` for each break.
        rounds: Number of work intervals; no break follows the last one.
    """
    for completed in range(rounds):
        round_number = completed + 1
        print(f'Starting Pomodoro {round_number}. May the focus be with you.')
        countdown(pomodoro_time, "Working")
        if round_number == rounds:
            # Final round: finish without a trailing break.
            continue
        print('Take a break!')
        countdown(break_time, "Break")
    print('Pomodoro timer is over. Great job!')
def countdown(duration: int, label: str) -> None:
    """Block for ``duration`` one-second sleeps while showing a progress bar.

    Args:
        duration: Number of one-second steps to wait.
        label: Prefix of the progress-bar description (``"<label> Time"``).
    """
    progress = tqdm(range(duration), desc=f"{label} Time")
    for _tick in progress:
        time.sleep(1)
def main() -> None:
    """Read the command-line options and run the Pomodoro timer."""
    args = parse_arguments()
    # Options are given in minutes; the timer works in seconds.
    seconds_per_minute = 60
    run_pomodoro(
        args.time * seconds_per_minute,
        args.break_time * seconds_per_minute,
        args.repeat,
    )
if __name__ == '__main__':
    # Start the timer only when executed as a script, not on import.
    main()
import os
import sys
import re
import chardet
from typing import List, Optional
class SMIConverter:
    """Convert SAMI (``.smi``) subtitle files into SubRip (``.srt``) files.

    One instance can convert several files in a row; the cue list is reset at
    the start of every conversion so cues never leak from one file into the
    next.
    """

    def __init__(self):
        # Cues parsed from the file currently being converted.
        self.srt_list: List[SMIItem] = []

    @staticmethod
    def usage(msg: Optional[str] = None, exit_code: int = 1) -> None:
        """Print the usage text, optionally followed by ``msg``, then exit.

        Args:
            msg: Extra line appended to the usage text when given.
            exit_code: Process exit status passed to ``sys.exit``.
        """
        print_msg = f"""
usage {os.path.basename(sys.argv[0])} smifile.smi [...]
convert smi into srt subtitle file with same filename.
By MoonChang Chae <[email protected]>
"""
        if msg:
            print_msg += f'{msg}\n'
        print(print_msg)
        sys.exit(exit_code)

    @staticmethod
    def ms2ts(ms: int) -> str:
        """Format a millisecond offset as an SRT timestamp ``HH:MM:SS,mmm``."""
        hours, ms = divmod(ms, 3600000)
        minutes, ms = divmod(ms, 60000)
        seconds, ms = divmod(ms, 1000)
        return f'{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}'

    def convert_smi(self, smi_file: str) -> bool:
        """Convert one SMI file and write the ``.srt`` file next to it.

        Args:
            smi_file: Path of the SMI file to read.

        Returns:
            ``True`` when an SRT file was written; ``False`` when the input
            does not exist or contains no ``<SYNC`` tag.

        Raises:
            ValueError: If a line contains ``<SYNC`` but not in the
                ``<Sync start=nnnn>`` form.
        """
        if not os.path.exists(smi_file):
            sys.stderr.write(f'Cannot find smi file <{smi_file}>\n')
            return False
        # Start from a clean slate so a reused converter does not carry the
        # previous file's cues into this one.
        self.srt_list = []
        srt_file = f'{os.path.splitext(smi_file)[0]}.srt'
        with open(smi_file, 'rb') as ifp:
            smi_sgml: bytes = ifp.read()
        # chardet reports encoding=None when it cannot guess (e.g. an empty
        # file); fall back to UTF-8 instead of crashing in decode().
        encoding = chardet.detect(smi_sgml)['encoding'] or 'utf-8'
        text = smi_sgml.decode(encoding)
        # Match the tag case-insensitively, like _parse_smi_lines does, so
        # files written with <Sync ...> or <sync ...> are accepted too.
        first_sync = re.search(r'<sync', text, flags=re.IGNORECASE)
        if not first_sync:
            return False
        lines: List[str] = text[first_sync.start():].split('\n')
        self._parse_smi_lines(lines)
        self._write_srt_file(srt_file)
        return True

    def _parse_smi_lines(self, lines: List[str]) -> None:
        """Split the SMI body into cues and append them to ``self.srt_list``.

        Every ``<SYNC start=N>`` tag closes the cue opened by the previous
        tag (using ``N`` as its end time) and opens a new one. The cue opened
        by the last tag is never closed and therefore not emitted.
        """
        sync_cont: str = ''
        si: Optional[SMIItem] = None
        last_si: Optional[SMIItem] = None
        for linecnt, line in enumerate(lines, 1):
            sndx = line.upper().find('<SYNC')
            if sndx >= 0:
                m = re.search(r'<sync\s+start\s*=\s*(\d+)>(.*)$', line, flags=re.IGNORECASE)
                if not m:
                    raise ValueError(f'Invalid format tag of <Sync start=nnnn> with "{line}"')
                # Text before the tag still belongs to the cue being closed.
                sync_cont += line[:sndx]
                last_si = si
                if last_si:
                    last_si.end_ms = int(m.group(1))
                    last_si.contents = sync_cont
                    self.srt_list.append(last_si)
                    last_si.linecount = linecnt
                sync_cont = m.group(2)
                si = SMIItem(start_ms=int(m.group(1)))
            else:
                sync_cont += line

    def _write_srt_file(self, srt_file: str) -> None:
        """Write the collected cues to ``srt_file`` in SRT format.

        Cues whose contents are empty after conversion are skipped. Cue
        numbers count only the cues actually written, so they stay
        consecutive from 1.
        """
        with open(srt_file, 'w', encoding='utf-8') as ofp:
            cue_number = 0
            for si in self.srt_list:
                si.convert_srt()
                if not si.contents:
                    continue
                cue_number += 1
                ofp.write(f'{cue_number}\n{si.start_ts} --> {si.end_ts}\n{si.contents}\n\n')
class SMIItem:
    """One subtitle cue: start/end times in milliseconds plus its markup."""

    def __init__(self, start_ms: int = 0):
        self.start_ms: int = start_ms
        self.start_ts: str = '00:00:00,000'
        self.end_ms: int = 0
        self.end_ts: str = '00:00:00,000'
        # Raw SMI markup until convert_srt() runs, SRT text afterwards.
        self.contents: Optional[str] = None
        self.linecount: int = 0

    def convert_srt(self) -> None:
        """Fill in the SRT timestamps and turn the SMI markup into SRT text."""
        self.start_ts = SMIConverter.ms2ts(self.start_ms)
        # The cue ends 10 ms before end_ms (presumably so it does not touch
        # the next cue's start -- confirm).
        self.end_ts = SMIConverter.ms2ts(self.end_ms - 10)
        if self.contents:
            self.contents = re.sub(r'\s+', ' ', self.contents)
            # Character entities such as &nbsp; are removed, not decoded.
            self.contents = re.sub(r'&[a-z]{2,5};', '', self.contents)
            self.contents = re.sub(r'(<br>)+', '\n', self.contents, flags=re.IGNORECASE)
            self._process_tags()
            self.contents = self.contents.strip()

    def _process_tags(self) -> None:
        """Strip markup from ``contents``, keeping only ``b``/``i``/``u`` tags.

        Tags are replaced in place, so text after a stray ``<`` or ``>`` that
        is not part of a tag (``I <3 you``, ``a > b``) is preserved instead of
        being cut off. ``<!-- ... -->`` comments are removed entirely.
        """
        if not self.contents:
            return

        def keep_or_drop(tag: 're.Match') -> str:
            # Keep the tag verbatim for the formatting tags, else drop it.
            return tag.group(0) if tag.group(1).lower() in ('b', 'i', 'u') else ''

        without_comments = re.sub(r'<!--.*?-->', '', self.contents, flags=re.DOTALL)
        self.contents = re.sub(
            r'</?([a-z]+)[^>]*>', keep_or_drop, without_comments, flags=re.IGNORECASE
        )

    def __repr__(self) -> str:
        return f'{self.start_ms}:{self.end_ms}:<{self.contents}>:{self.linecount}'
def main() -> None:
    """Convert every SMI file named on the command line and report the result.

    With no file arguments the usage text is printed and the process exits.
    """
    smi_files = sys.argv[1:]
    if not smi_files:
        SMIConverter.usage()
    converter = SMIConverter()
    for smi_file in smi_files:
        if converter.convert_smi(smi_file):
            message = f"Converting <{smi_file}> OK!"
        else:
            message = f"Converting <{smi_file}> Failure!"
        print(message)
if __name__ == "__main__":
    # Run the converter only when executed as a script, not on import.
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment