Created
September 21, 2024 00:16
-
-
Save partrita/c0f713018fd2e652cec453c92ed19621 to your computer and use it in GitHub Desktop.
Python scripts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import glob | |
import csv | |
from typing import List, Dict | |
from pathlib import Path | |
def make_file_list(extension: str) -> List[str]:
    """Return the files in the current directory that end in *extension*.

    Args:
        extension: File extension without the leading dot, e.g. ``"csv"``.

    Returns:
        The paths matched by the glob pattern ``*.<extension>``, relative to
        the current working directory, in the order ``glob.glob`` yields them.
    """
    pattern = "*.{}".format(extension)
    matches = glob.glob(pattern)
    return matches
def parse_chemical_formula(formula: str) -> Dict[str, str]:
    """Count the atoms of each element in a flat chemical formula.

    The formula is scanned for ``<Symbol><count>`` pairs, where a symbol is
    one uppercase letter followed by any lowercase letters and the count is
    an optional run of digits (a missing count means 1). A symbol that
    occurs more than once (e.g. ``CH3CH2OH``) has its counts added together.

    Characters that are not part of such a pair (parentheses, charges,
    dots, ...) are skipped, so grouped formulas such as ``Ca(OH)2`` are not
    expanded.

    Args:
        formula: Chemical formula text, e.g. ``"C6H12O6"``.

    Returns:
        A mapping from element symbol to its total count as a decimal
        string, in order of first appearance. Empty for an empty formula.
    """
    totals: Dict[str, int] = {}
    # One pass with two capture groups replaces the three separate regex
    # scans the segment-based version needed per element.
    for symbol, count in re.findall(r'([A-Z][a-z]*)([0-9]*)', formula):
        totals[symbol] = totals.get(symbol, 0) + (int(count) if count else 1)
    # Counts go back out as strings because callers write them to CSV and
    # use the string '0' as the default for absent elements.
    return {symbol: str(total) for symbol, total in totals.items()}
def process_csv_file(input_file: str, chemical_elements: List[str]) -> None:
    """Write a copy of *input_file* with one extra column per element.

    The formula is read from the fourth column (index 3) of every row. The
    header row, recognised by the literal value ``Formula`` in that column,
    gets the element symbols appended. Every other row gets the count of
    each element in its formula, or ``'0'`` when the element is absent.

    Rows with fewer than four columns (for example blank lines) carry no
    formula and are copied through unchanged rather than raising
    ``IndexError``.

    Args:
        input_file: Path of the CSV file to read.
        chemical_elements: Element symbols to add as columns, in order.

    The result is written next to the input with the suffix replaced by
    ``.CHNOSPSi.csv``.
    """
    formula_column = 3
    output_file = Path(input_file).with_suffix('.CHNOSPSi.csv')
    with open(input_file, 'r', newline='') as infile, \
            open(output_file, 'w', newline='') as outfile:
        writer = csv.writer(outfile)
        for row in csv.reader(infile):
            if len(row) <= formula_column:
                # No formula cell to inspect: pass the row through as-is.
                writer.writerow(row)
                continue
            if row[formula_column] == 'Formula':
                writer.writerow(row + chemical_elements)
                continue
            formula_dict = parse_chemical_formula(row[formula_column])
            counts = [formula_dict.get(element, '0') for element in chemical_elements]
            writer.writerow(row + counts)
def main():
    """Annotate every CSV file in the current directory with element counts.

    Files whose name already ends in ``.CHNOSPSi.csv`` are skipped: they are
    outputs of an earlier run, and because they also match ``*.csv`` they
    would otherwise be processed again into ``*.CHNOSPSi.CHNOSPSi.csv``.
    """
    chemical_elements = ['C', 'H', 'N', 'O', 'S', 'P', 'Si']
    output_suffix = '.CHNOSPSi.csv'
    for file in make_file_list('csv'):
        if file.endswith(output_suffix):
            continue
        process_csv_file(file, chemical_elements)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib.request | |
import xml.etree.ElementTree as ET | |
import os | |
import gzip | |
import shutil | |
from typing import List, Optional | |
def fetch_pdb_list(sequence: str, e_cutoff: str, resolution: str) -> List[str]:
    """POST a composite sequence/resolution/title query and return the hits.

    The query has three refinement levels:

    0. a BLAST sequence search for *sequence* with E-value cut-off
       *e_cutoff*;
    1. a resolution filter whose upper bound is *resolution*;
    2. a title filter that excludes structures whose title contains "arp".

    Args:
        sequence: Amino-acid sequence. Any whitespace (line wraps,
            indentation) is removed before it is placed in the query.
        e_cutoff: Value for the ``<eCutOff>`` element.
        resolution: Value for ``<refine.ls_d_res_high.max>``.

    Returns:
        The whitespace-separated tokens of the response body (presumably one
        PDB ID each -- verify against the service), or an empty list when
        the body is empty.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    # NOTE(review): RCSB announced the retirement of the legacy /pdb/rest
    # services in favour of search.rcsb.org -- confirm this endpoint still
    # answers before relying on it.
    url: str = 'http://www.rcsb.org/pdb/rest/search'
    # Callers pass a wrapped multi-line literal; send only the residues.
    residues: str = "".join(sequence.split())
    # .strip() matters: an XML declaration is only valid as the very first
    # characters of the document, so the leading newline must go.
    query_xml: str = f"""
<?xml version="1.0" encoding="UTF-8"?>
<orgPdbCompositeQuery version="1.0">
<queryRefinement>
<queryRefinementLevel>0</queryRefinementLevel>
<orgPdbQuery>
<queryType>org.pdb.query.simple.SequenceQuery</queryType>
<sequence>{residues}</sequence>
<eCutOff>{e_cutoff}</eCutOff>
<searchTool>blast</searchTool>
</orgPdbQuery>
</queryRefinement>
<queryRefinement>
<queryRefinementLevel>1</queryRefinementLevel>
<orgPdbQuery>
<queryType>org.pdb.query.simple.ResolutionQuery</queryType>
<description>ResolutionQuery </description>
<refine.ls_d_res_high.comparator>between</refine.ls_d_res_high.comparator>
<refine.ls_d_res_high.max>{resolution}</refine.ls_d_res_high.max>
</orgPdbQuery>
</queryRefinement>
<queryRefinement>
<queryRefinementLevel>2</queryRefinementLevel>
<orgPdbQuery>
<queryType>org.pdb.query.simple.StructTitleQuery</queryType>
<struct.title.comparator>!contains</struct.title.comparator>
<struct.title.value>arp</struct.title.value>
</orgPdbQuery>
</queryRefinement>
</orgPdbCompositeQuery>
""".strip()
    print("Querying PDB...\n")
    req: urllib.request.Request = urllib.request.Request(url, data=query_xml.encode())
    with urllib.request.urlopen(req) as response:
        result: str = response.read().decode()
    # str.split() already yields [] for an empty body.
    return result.split()
def download_pdb_files(pdb_list: List[str], biological_unit: bool = True) -> None:
    """Download and decompress one coordinate file per PDB ID.

    Each entry is fetched as ``<id>.pdb.gz`` into the current directory,
    decompressed to ``<id>.pdb`` and the compressed file is then removed.
    The compressed file is also removed when the download or the
    decompression fails, so a failed run leaves no stray ``.gz`` behind.

    Args:
        pdb_list: PDB identifiers. They are lower-cased for the URL; the
            local file names use them as given.
        biological_unit: When true (the default) fetch the first biological
            assembly (``.pdb1.gz``) from the wwPDB FTP site, otherwise the
            plain entry from rcsb.org.

    Raises:
        urllib.error.URLError, OSError: Propagated from the download or the
            decompression of the entry that failed; later entries are not
            attempted.
    """
    if biological_unit:
        pdb_url: str = "ftp://ftp.wwpdb.org/pub/pdb/data/biounit/coordinates/all/{0}.pdb1.gz"
    else:
        pdb_url = "http://www.rcsb.org/pdb/files/{0}.pdb.gz"
    for pdb in pdb_list:
        print(f"Downloading {pdb}")
        url: str = pdb_url.format(pdb.lower())
        filename: str = f"{pdb}.pdb.gz"
        try:
            urllib.request.urlretrieve(url, filename)
            with gzip.open(filename, 'rb') as f_in, open(f"{pdb}.pdb", 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        finally:
            # The archive is only a temporary artefact; it may not exist at
            # all if the download itself failed.
            if os.path.exists(filename):
                os.remove(filename)
def generate_pymol_script(pdb_list: List[str]) -> None:
    """Write ``load.pml``, a PyMOL script that loads and aligns the entries.

    The script, created in the current working directory, contains:

    * one ``load <cwd>/<id>.pdb`` line per entry of *pdb_list*;
    * ``hide all`` followed by ``show cartoon``;
    * one ``cealign <first>, <other>`` line for every entry after the first.

    Args:
        pdb_list: PDB identifiers; the first one is the alignment reference.
    """
    with open('load.pml', 'w') as script:
        script.writelines(
            f"load {os.getcwd()}/{entry}.pdb\n" for entry in pdb_list
        )
        script.writelines(["hide all\n", "show cartoon\n"])
        # Everything after the first entry is aligned onto the first one.
        script.writelines(
            f"cealign {pdb_list[0]}, {mobile}\n" for mobile in pdb_list[1:]
        )
def fetch_pdb_info(pdb_list: List[str]) -> None:
    """Fetch a custom report for the entries and save it as ``list.txt``.

    The report is requested as XML. Every ``<record>`` element becomes one
    line of ``list.txt`` (written to the current directory), with the text
    of its child elements joined by tabs. A child without text contributes
    an empty cell, so each requested column stays in the same position on
    every line.

    If the response body is empty a message is printed and no file is
    written.

    Args:
        pdb_list: PDB identifiers, sent comma-separated in the query string.

    Raises:
        urllib.error.URLError: If the request fails.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    # NOTE(review): this is the legacy RCSB REST service, which RCSB
    # announced as retired -- confirm it still answers.
    report_url: str = "http://www.rcsb.org/pdb/rest/customReport?pdbids={0}&customReportColumns=structureId,structureTitle,resolution,experimentalTechnique,depositionDate,structureAuthor,classification,structureMolecularWeight&service=wsdisplay&format=xml"
    req: urllib.request.Request = urllib.request.Request(report_url.format(",".join(pdb_list)))
    with urllib.request.urlopen(req) as response:
        result: str = response.read().decode()
    if not result:
        print("Failed to retrieve PDB info")
        return
    root: ET.Element = ET.fromstring(result)
    with open('list.txt', 'w') as f:
        for record in root.iter('record'):
            # Dropping empty fields would shift the remaining columns left.
            f.write("\t".join(field.text or "" for field in record) + "\n")
def main() -> None:
    """Search for structures similar to a fixed sequence and fetch them.

    Runs the sequence search with an E-value cut-off of ``1e-30`` and a
    resolution limit of ``3``. When hits are returned, downloads them,
    writes the PyMOL loader script and saves the summary report; otherwise
    prints a notice.
    """
    e_cutoff: str = "1e-30"
    resolution: str = "3"
    sequence: str = """
MSMILKEIRMNNFKSHVNSRIKFEKGIVAIIGENGSGKSSIFEAVFFALFGAGSNFNYDT
IITKGKKSVYVELDFEVNGNNYKIIREYDSGRGGAKLYKNGKPYATTISAVNKAVNEILG
VDRNMFLNSIYIKQGEIAKFLSLKPSEKLETVAKLLG
"""
    hits: List[str] = fetch_pdb_list(sequence, e_cutoff, resolution)
    if not hits:
        print("No PDB structures found")
        return
    download_pdb_files(hits)
    generate_pymol_script(hits)
    fetch_pdb_info(hits)


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import time | |
import argparse | |
from tqdm import tqdm | |
def parse_arguments() -> argparse.Namespace:
    """Read the timer settings from the command line.

    Returns:
        A namespace with integer attributes ``time`` (default 25),
        ``break_time`` (default 5) and ``repeat`` (default 3).
    """
    parser = argparse.ArgumentParser(description='Set time of timer. It should be in minutes.')
    # (short flag, long flag, default, help text) for each integer option.
    option_table = (
        ('-t', '--time', 25, 'Set pomodoro interval in minutes.'),
        ('-b', '--break_time', 5, 'Set break interval in minutes.'),
        ('-r', '--repeat', 3, 'Set number of pomodoro rounds.'),
    )
    for short_flag, long_flag, default_value, help_text in option_table:
        parser.add_argument(short_flag, long_flag, type=int, default=default_value, help=help_text)
    return parser.parse_args()
def run_pomodoro(pomodoro_time: int, break_time: int, rounds: int) -> None:
    """Run *rounds* work intervals with a break between consecutive ones.

    Args:
        pomodoro_time: Length passed to ``countdown`` for each work interval.
        break_time: Length passed to ``countdown`` for each break.
        rounds: Number of work intervals. No break follows the last one.
    """
    for round_number in range(1, rounds + 1):
        print(f'Starting Pomodoro {round_number}. May the focus be with you.')
        countdown(pomodoro_time, "Working")
        is_final_round = round_number >= rounds
        if is_final_round:
            continue
        print('Take a break!')
        countdown(break_time, "Break")
    print('Pomodoro timer is over. Great job!')
def countdown(duration: int, label: str) -> None:
    """Block for *duration* one-second steps while showing a progress bar.

    Args:
        duration: Number of one-second sleeps to perform.
        label: Prefix of the bar description, shown as ``"<label> Time"``.
    """
    progress_bar = tqdm(range(duration), desc=f"{label} Time")
    for _tick in progress_bar:
        time.sleep(1)
def main() -> None:
    """Parse the command line and start the timer.

    The interval lengths given in minutes are converted to seconds before
    they are handed to ``run_pomodoro``.
    """
    seconds_per_minute = 60
    args = parse_arguments()
    run_pomodoro(
        args.time * seconds_per_minute,
        args.break_time * seconds_per_minute,
        args.repeat,
    )


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import re | |
import chardet | |
from typing import List, Optional | |
class SMIConverter:
    """Convert SAMI (``.smi``) subtitle files into SubRip (``.srt``) files.

    One instance may be reused for several files: the list of parsed cues
    is reset at the start of every conversion.
    """

    def __init__(self):
        # Cues parsed from the file that is currently being converted.
        self.srt_list: List[SMIItem] = []

    @staticmethod
    def usage(msg: Optional[str] = None, exit_code: int = 1) -> None:
        """Print the usage text, followed by *msg* if given, then exit.

        Args:
            msg: Optional extra line appended to the usage text.
            exit_code: Process exit status passed to ``sys.exit``.
        """
        print_msg = (
            f"\nusage {os.path.basename(sys.argv[0])} smifile.smi [...]\n"
            "convert smi into srt subtitle file with same filename.\n"
            "By MoonChang Chae <[email protected]>\n"
        )
        if msg:
            print_msg += f'{msg}\n'
        print(print_msg)
        sys.exit(exit_code)

    @staticmethod
    def ms2ts(ms: int) -> str:
        """Format a millisecond offset as an SRT timestamp ``HH:MM:SS,mmm``."""
        hours, ms = divmod(ms, 3600000)
        minutes, ms = divmod(ms, 60000)
        seconds, ms = divmod(ms, 1000)
        return f'{hours:02d}:{minutes:02d}:{seconds:02d},{ms:03d}'

    def convert_smi(self, smi_file: str) -> bool:
        """Convert *smi_file* and write the result next to it as ``.srt``.

        The file's encoding is guessed with ``chardet``; if no guess is
        available UTF-8 is assumed, and undecodable bytes are replaced
        rather than aborting the conversion. Everything before the first
        ``<SYNC`` tag (matched case-insensitively, like the parser does) is
        ignored.

        Args:
            smi_file: Path of the SAMI file.

        Returns:
            ``True`` when an SRT file was written, ``False`` when the input
            does not exist or contains no ``<SYNC`` tag.

        Raises:
            ValueError: If a line contains a ``<SYNC`` tag that does not
                have the form ``<Sync start=nnnn>``.
        """
        if not os.path.exists(smi_file):
            sys.stderr.write(f'Cannot find smi file <{smi_file}>\n')
            return False
        srt_file = f'{os.path.splitext(smi_file)[0]}.srt'
        with open(smi_file, 'rb') as ifp:
            raw: bytes = ifp.read()
        # chardet reports the encoding as None when it cannot make a guess.
        encoding = chardet.detect(raw).get('encoding') or 'utf-8'
        text = raw.decode(encoding, errors='replace')
        first_sync = re.search(r'<sync', text, flags=re.IGNORECASE)
        if first_sync is None:
            return False
        # Start from a clean slate so cues of a previously converted file
        # do not leak into this file's output.
        self.srt_list = []
        self._parse_smi_lines(text[first_sync.start():].split('\n'))
        self._write_srt_file(srt_file)
        return True

    def _parse_smi_lines(self, lines: List[str]) -> None:
        """Split *lines* into cues and append them to ``self.srt_list``.

        A cue starts at a ``<SYNC start=N>`` tag and is closed by the next
        such tag, whose start time becomes the cue's end time. The cue
        opened by the last tag is therefore never closed and is not
        emitted.
        """
        sync_cont: str = ''
        si: Optional[SMIItem] = None
        for linecnt, line in enumerate(lines, 1):
            tag = re.search(r'<sync', line, flags=re.IGNORECASE)
            if tag is None:
                sync_cont += line
                continue
            m = re.search(r'<sync\s+start\s*=\s*(\d+)>(.*)$', line, flags=re.IGNORECASE)
            if not m:
                raise ValueError(f'Invalid format tag of <Sync start=nnnn> with "{line}"')
            # Text in front of the tag still belongs to the open cue.
            sync_cont += line[:tag.start()]
            if si is not None:
                si.end_ms = int(m.group(1))
                si.contents = sync_cont
                si.linecount = linecnt
                self.srt_list.append(si)
            sync_cont = m.group(2)
            si = SMIItem(start_ms=int(m.group(1)))

    def _write_srt_file(self, srt_file: str) -> None:
        """Write the non-empty cues of ``self.srt_list`` to *srt_file*.

        Cues are numbered 1, 2, 3, ... in output order; cues that are empty
        after conversion are skipped without consuming a number.
        """
        with open(srt_file, 'w', encoding='utf-8') as ofp:
            ndx = 1
            for si in self.srt_list:
                si.convert_srt()
                if not si.contents:
                    continue
                ofp.write(f'{ndx}\n{si.start_ts} --> {si.end_ts}\n{si.contents}\n\n')
                ndx += 1
class SMIItem:
    """One subtitle cue: start/end time in milliseconds plus its markup."""

    def __init__(self, start_ms: int = 0):
        self.start_ms: int = start_ms
        self.start_ts: str = '00:00:00,000'
        self.end_ms: int = 0
        self.end_ts: str = '00:00:00,000'
        # Raw SAMI markup until convert_srt() runs, SRT text afterwards.
        self.contents: Optional[str] = None
        # Input line number at which the cue was closed.
        self.linecount: int = 0

    def convert_srt(self) -> None:
        """Fill in the SRT timestamps and turn the markup into SRT text.

        The end time is pulled back by 10 ms from the start of the next
        cue, but never to before this cue's own start, so the timestamp can
        neither go negative nor precede the start.

        The contents are whitespace-collapsed, stripped of character
        entities such as ``&nbsp;``, have ``<br>`` runs turned into
        newlines, and lose every tag except ``<b>``, ``<i>`` and ``<u>``.
        """
        self.start_ts = SMIConverter.ms2ts(self.start_ms)
        self.end_ts = SMIConverter.ms2ts(max(self.end_ms - 10, self.start_ms))
        if self.contents:
            self.contents = re.sub(r'\s+', ' ', self.contents)
            self.contents = re.sub(r'&[a-z]{2,5};', '', self.contents)
            self.contents = re.sub(r'(<br>)+', '\n', self.contents, flags=re.IGNORECASE)
            self._process_tags()
            self.contents = self.contents.strip()

    def _process_tags(self) -> None:
        """Remove all tags from the contents except ``<b>``, ``<i>``, ``<u>``.

        Text between tags is kept verbatim, including a stray ``<`` or
        ``>`` that is not part of a tag (for example a ``>>`` speaker
        marker); it does not cut off the rest of the cue. SGML comments
        (``<!-- ... -->``) are removed.
        """
        if not self.contents:
            return
        text = re.sub(r'<!--.*?-->', '', self.contents, flags=re.DOTALL)
        kept = []
        position = 0
        for tag in re.finditer(r'</?([a-z]+)[^>]*>', text, flags=re.IGNORECASE):
            kept.append(text[position:tag.start()])
            if tag.group(1).lower() in ('b', 'i', 'u'):
                kept.append(tag.group(0))
            position = tag.end()
        kept.append(text[position:])
        self.contents = ''.join(kept)

    def __repr__(self) -> str:
        return f'{self.start_ms}:{self.end_ms}:<{self.contents}>:{self.linecount}'
def main() -> None:
    """Convert every SMI file named on the command line.

    With no file arguments the usage text is printed and the process
    exits. Otherwise one status line is printed per file.
    """
    smi_files = sys.argv[1:]
    if not smi_files:
        SMIConverter.usage()
    converter = SMIConverter()
    for smi_file in smi_files:
        succeeded = converter.convert_smi(smi_file)
        status_line = (
            f"Converting <{smi_file}> OK!"
            if succeeded
            else f"Converting <{smi_file}> Failure!"
        )
        print(status_line)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment