Last active
May 23, 2017 12:35
-
-
Save geocarvalho/ea1cc42b378a943c603bf418c13b6e5d to your computer and use it in GitHub Desktop.
list_from_bed.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# list_from_bed.py
#
# Collapse a tab-separated BED file into one "chr:start-end" interval per gene.
#
# Usage: python list_from_bed.py <bed_file>
#
# For every distinct gene name (column 4, truncated at its first '.'), the
# smallest start and the largest end over all of that gene's rows are written
# as one line to "<prefix>.list", where <prefix> is the input path with
# everything after the first '.' of the file name removed. Each line is also
# echoed to stdout.
import os
import sys

import numpy as np  # not used below; kept because the original script imported it
import pandas as pd

BED_COLUMNS = ['Chr', 'Start', 'End', 'Gene']


class MultiChromosomeError(ValueError):
    """Raised when the rows of a single gene name span several chromosomes."""


def _output_prefix(bed_file):
    """Return the output path prefix for *bed_file*.

    Only the file name is cut at its first '.'; the directory part is kept
    untouched so that paths such as './panel.bed' or '../data/panel.bed' do
    not collapse to an empty prefix.
    """
    directory, filename = os.path.split(bed_file)
    return os.path.join(directory, filename.split('.')[0])


def _read_bed(bed_file):
    """Load the first four columns of *bed_file* into a DataFrame.

    The gene column is truncated at its first '.'.
    """
    # NOTE(review): header=0 makes pandas consume the first line of the file
    # as a header row (replaced by BED_COLUMNS). That matches the original
    # script, but a header-less BED file loses its first record -- confirm
    # which kind of input this is meant for.
    bed_df = pd.read_csv(bed_file, sep='\t', header=0, names=BED_COLUMNS,
                         usecols=[0, 1, 2, 3])
    bed_df['Gene'] = bed_df['Gene'].str.split('.').str[0]
    return bed_df


def _iter_gene_regions(bed_df):
    """Yield one 'chr:start-end\\n' string per gene, in first-seen order.

    Raises:
        MultiChromosomeError: when a gene's rows are not all on one chromosome.
    """
    # sort=False keeps genes in order of first appearance, like Series.unique().
    for _, rows in bed_df.groupby('Gene', sort=False):
        chromosomes = rows['Chr'].unique()
        if len(chromosomes) != 1:
            raise MultiChromosomeError(
                "Gene includes more than one chromossome, check the bed_file")
        yield '{}:{}-{}\n'.format(chromosomes[0], rows['Start'].min(),
                                  rows['End'].max())


def main(argv=None):
    """Run the conversion for the BED file named in ``argv[1]``.

    *argv* defaults to ``sys.argv``. Exits with a usage message when no input
    file is given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('usage: %s <bed_file>' % os.path.basename(argv[0] if argv else 'list_from_bed.py'))

    bed_file = argv[1]
    bed_df = _read_bed(bed_file)

    # The context manager guarantees the output is flushed and closed, even
    # when processing stops early.
    with open('%s.list' % _output_prefix(bed_file), 'w') as out:
        try:
            for region in _iter_gene_regions(bed_df):
                out.write(region)
                # region already ends with '\n', so stdout shows a blank line
                # after each interval, exactly as the original script did.
                print(region)
        except MultiChromosomeError as err:
            # Same as the original: report the problem and stop, keeping the
            # intervals that were already written.
            print(err)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment