import pandas

def splitDataFrameList(df, target_column, separator):
    '''df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a dataframe with each entry for the target column separated,
    with each element moved into a new row.
    The values in the other columns are duplicated across the newly divided rows.
    '''
    def splitListToRows(row, row_accumulator, target_column, separator):
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows, axis=1, args=(new_rows, target_column, separator))
    new_df = pandas.DataFrame(new_rows)
    return new_df
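For anyone skimming, a minimal usage sketch (the frame and column names below are made up for illustration):

import pandas

df = pandas.DataFrame({"name": ["a", "b"], "tags": ["x;y;z", "w"]})
# one row per tag; the "name" value is duplicated across the new rows
out = splitDataFrameList(df, "tags", ";")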
Hey I made it so it can accept multiple columns and try to split on all of them at the same time
import pandas as pd

def split_dataframe_rows(df, column_selectors, row_delimiter):
    # we need to keep track of the ordering of the columns
    def _split_list_to_rows(row, row_accumulator, column_selectors, row_delimiter):
        split_rows = {}
        max_split = 0
        for column_selector in column_selectors:
            split_row = row[column_selector].split(row_delimiter)
            split_rows[column_selector] = split_row
            if len(split_row) > max_split:
                max_split = len(split_row)
        for i in range(max_split):
            new_row = row.to_dict()
            for column_selector in column_selectors:
                try:
                    new_row[column_selector] = split_rows[column_selector].pop(0)
                except IndexError:
                    new_row[column_selector] = ''
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(_split_list_to_rows, axis=1, args=(new_rows, column_selectors, row_delimiter))
    new_df = pd.DataFrame(new_rows, columns=df.columns)
    return new_df
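As a rough illustration of how this version behaves (the frame, column names, and data below are invented):

import pandas as pd

df = pd.DataFrame({"contact": ["adam,bob", "carol"],
                   "email": ["adam@x.com,bob@x.com", "carol@x.com"],
                   "notes": ["shared note", "solo note"]})
# "contact" and "email" are split in step; "notes" is repeated on every new row
out = split_dataframe_rows(df, ["contact", "email"], ",")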
Thanks bro .. worked like a charm..
Worked wonders, Thank you so much :D
Thank you, I modified it a little bit to accommodate multiple delimiters by generating a regex pattern.

Like this:

delimiters = ",", "|"

Import the re module for this:
import re
import pandas as pd

def splitDataFrameList(df, target_column, delimiters):
    '''df = dataframe to split,
    target_column = the column containing the values to split
    delimiters = the symbols used to perform the split
    returns: a dataframe with each entry for the target column separated,
    with each element moved into a new row.
    The values in the other columns are duplicated across the newly divided rows.
    '''
    regexPattern = "|".join(map(re.escape, delimiters))
    def splitListToRows(row, row_accumulator, target_column, regexPattern):
        split_row = re.split(regexPattern, row[target_column])
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows, axis=1, args=(new_rows, target_column, regexPattern))
    new_df = pd.DataFrame(new_rows)
    return new_df
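A small sketch of calling it with two delimiters (the frame and column names here are invented):

import pandas as pd

delimiters = ",", "|"
df = pd.DataFrame({"id": [1, 2], "codes": ["a,b|c", "d"]})
# id 1 yields rows a, b, c; id 2 yields row d; "id" is duplicated on each row
out = splitDataFrameList(df, "codes", delimiters)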
I have a dataframe that contains fieldnames and field content. There is no specific delimiter, but the fieldnames are limitative.
I have a somewhat clumsy way:
https://stackoverflow.com/questions/60773067/split-string-no-delimiter-with-limitative-field-names-and-content
But this is far from optimal. Any advice?
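In case it helps, one common trick for that situation is to turn the known field names themselves into the delimiters via a lookahead regex. A rough sketch (the field names and text below are hypothetical, not from your data, and splitting on a zero-width match needs Python 3.7+):

import re

field_names = ["Name", "Address", "Phone"]  # hypothetical limitative field names
pattern = "(?=" + "|".join(map(re.escape, field_names)) + ")"
text = "NameAdam SmithAddress1 Main StPhone555-0100"
# split just before each field name, then drop the leading empty piece
parts = [p for p in re.split(pattern, text) if p]
# parts -> ['NameAdam Smith', 'Address1 Main St', 'Phone555-0100']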
What a help! Very handy, saved time and worked like magic!! Thank you!
Exactly what I was looking for.
> Hey I made it so it can accept multiple columns and try to split on all of them at the same time
Works great, exactly what I was looking for. Thanks!
Very useful. Thanks
Thanks a lot for this code.
Thanks! Very useful!
You could use 'pd.' instead of 'pandas.' :)
From pandas 0.25 on, one can use explode
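A minimal sketch of that approach (pandas >= 0.25; the frame below is made up):

import pandas as pd

df = pd.DataFrame({"name": ["a", "b"], "vals": ["1,2,3", "4"]})
# split into lists, then explode one list element per row; other columns are duplicated
out = df.assign(vals=df["vals"].str.split(",")).explode("vals").reset_index(drop=True)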
Thank you !
very helpful!
Need some help. I am unable to run the script provided by @namanjh and others above. I am using the same input data file, separated by commas. The script errors because new_df is not defined. Please help. Thanks.
Input data:
| Contact | email | email2 | phone | notes |
|---|---|---|---|---|
| adam,bob | adam.con, bob.com, john.com | adam.com2, bob.com2, john.com2 | adamphone, bobphone, johnphone | should be same for everyone |
| other contact | don’t touch this | asdf | asdf | don’t touch this |
| rachael, simone, snake | rachael.com, simone.com | rachael.com2, simone.com2, snake.com2 | rachaelphone, simonephone | should be same for everyone |
| other contact | don’t touch this | asdf | asdf | don’t touch this |
Script:
#import numpy as np
import pandas as pd
from IPython.display import display

df = pd.DataFrame(pd.read_excel("file_path.xlsx"))
column_selectors = list(df)
row_delimiters = ','
#new_df = []
display(df)
display(column_selectors)

def split_dataframe_rows(df, column_selectors, row_delimiters):
    def _split_list_to_rows(row, row_accumulator, column_selector, row_delimiter):
        split_rows = {}
        max_split = 0
        for column_selector in column_selectors:
            split_row = row[column_selector].split(row_delimiter)
            split_rows[column_selector] = split_row
            if len(split_row) > max_split:
                max_split = len(split_row)
        for i in range(max_split):
            new_row = row.to_dict()
            for column_selector in column_selectors:
                try:
                    new_row[column_selector] = split_rows[column_selector].pop(0)
                except IndexError:
                    new_row[column_selector] = ''
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(_split_list_to_rows, axis=1, args = (new_rows, column_selectors, row_delimiter))
    new_df = pd.DataFrame(new_rows, column=df.columns)
    return new_df

df2 = new_df
display(df2)
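In case it helps with the error above: new_df only exists inside split_dataframe_rows, so it has to be captured from the function's return value. A minimal sketch, reusing the variables defined earlier in the script (and assuming the args tuple inside the function is changed to pass row_delimiters, and the pd.DataFrame keyword is columns= rather than column=):

# call the function and keep its return value instead of referencing new_df directly
df2 = split_dataframe_rows(df, column_selectors, row_delimiters)
display(df2)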
Super excited, this is my first time commenting with something I made, be gentle. I needed something similar when a number and a string were in one column separated by a comma (blanks as well). I modified my code a bit to hopefully work a little more universally.
The code checks how many times the delimiter is used in each row of the column, then repeats that row once for each piece.
import pandas as pd
from itertools import chain
import numpy as np
def chainer(df, col, sep, dtype):
    # cast the target column so the .str methods work even on float/NaN values
    df = df.astype({col: dtype})
    # number of pieces each row splits into
    lens = df[col].str.split(sep).map(len)
    dicts = {}
    for cols in df.columns:
        if cols == col:
            # flatten the split pieces into one long column
            dicts[cols] = list(chain.from_iterable(df[cols].str.split(sep)))
        else:
            # repeat every other column's value once per piece
            dicts[cols] = np.repeat(df[cols], lens)
    return pd.DataFrame.from_dict(dicts)
df = chainer(df,'Combined Column',',','str')
I added the astype because my column had floats and NaNs that wouldn't split; converting to str first worked like a champ.
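A quick sketch of what a call looks like end to end (the frame here is invented; the real column in the comment above was 'Combined Column'):

import pandas as pd

df = pd.DataFrame({"id": [1, 2], "Combined Column": ["5,apple", "7,pear"]})
# each id is repeated once per comma-separated piece of 'Combined Column'
out = chainer(df, "Combined Column", ",", "str")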