-
-
Save jlln/338b4b0b55bd6984f883 to your computer and use it in GitHub Desktop.
def splitDataFrameList(df, target_column, separator):
    '''Split delimiter-separated values in one column into multiple rows.

    df: dataframe to split
    target_column: the column containing the values to split
    separator: the symbol used to perform the split
    returns: a dataframe with each entry for the target column separated,
        with each element moved into a new row. The values in the other
        columns are duplicated across the newly divided rows.
    '''
    def splitListToRows(row, row_accumulator, target_column, separator):
        # One new record per fragment; the other columns are copied as-is.
        split_row = row[target_column].split(separator)
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows, axis=1, args=(new_rows, target_column, separator))
    new_df = pandas.DataFrame(new_rows)
    return new_df
Hey I made it so it can accept multiple columns and try to split on all of them at the same time
def split_dataframe_rows(df, column_selectors, row_delimiter):
    """Split delimited strings in several columns into rows simultaneously.

    df: dataframe to split
    column_selectors: list of column names whose string values are split
    row_delimiter: the delimiter used for every listed column
    returns: a new dataframe (same column order as ``df``, so we keep track
        of the ordering of the columns) where row i of each listed column
        holds the i-th fragment; columns with fewer fragments are padded
        with '' and the remaining columns are duplicated across rows.
    """
    def _split_list_to_rows(row, row_accumulator, column_selectors, row_delimiter):
        # (The original declared an unused singular ``column_selector`` param
        # and silently relied on the closure; the plural param is now real.)
        split_rows = {}
        max_split = 0
        for column_selector in column_selectors:
            split_row = row[column_selector].split(row_delimiter)
            split_rows[column_selector] = split_row
            max_split = max(max_split, len(split_row))
        for i in range(max_split):
            new_row = row.to_dict()
            for column_selector in column_selectors:
                fragments = split_rows[column_selector]
                # Index instead of repeated .pop(0) (which is O(n) per call);
                # pad with '' when this column runs out of fragments.
                new_row[column_selector] = fragments[i] if i < len(fragments) else ''
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(_split_list_to_rows, axis=1, args=(new_rows, column_selectors, row_delimiter))
    new_df = pd.DataFrame(new_rows, columns=df.columns)
    return new_df
into
And here is some variation of @JoaoCarabetta's split function, that leaves additional columns as they are (no drop of columns) and sets list-columns with empty lists with None, while copying the other rows as they were.
def split_data_frame_list(df,
                          target_column,
                          output_type=float):
    '''
    Accepts a column with multiple types and splits list variables to several rows.

    df: dataframe to split
    target_column: the column containing the values to split
    output_type: accepted for backward compatibility (not applied to outputs)
    returns: a dataframe with each entry for the target column separated, with
        each element moved into a new row. An empty list becomes one row with
        None; non-list values pass through unchanged. The values in the other
        columns are duplicated across the newly divided rows.
    '''
    accumulated = []

    def _explode(row):
        cell = row[target_column]
        if isinstance(cell, list):
            for element in cell:
                record = row.to_dict()
                record[target_column] = element
                accumulated.append(record)
            # An empty list still yields one row, carrying None.
            if not cell:
                record = row.to_dict()
                record[target_column] = None
                accumulated.append(record)
        else:
            # Non-list cells are copied through untouched.
            record = row.to_dict()
            record[target_column] = cell
            accumulated.append(record)

    df.apply(_explode, axis=1)
    return pd.DataFrame(accumulated)
>>> df = pd.DataFrame({'name':['a','b','c','d'], "items":[['a1','a2','a3'],['b1','b2','b3'],['c1','c2','c3'],[]],'leave me':range(4)})
>>> df
items leave me name
0 [a1, a2, a3] 0 a
1 [b1, b2, b3] 1 b
2 [c1, c2, c3] 2 c
3 [] 3 d
>>> split_data_frame_list(df, target_column='items')
items leave me name
0 a1 0 a
1 a2 0 a
2 a3 0 a
3 b1 1 b
4 b2 1 b
5 b3 1 b
6 c1 2 c
7 c2 2 c
8 c3 2 c
9 None 3 d
helps a lot, thank you =D
@zouweilin 's extended version for lists
def split_dataframe_rows(df, column_selectors):
    """Explode list-valued cells in several columns into rows simultaneously.

    df: dataframe whose listed columns hold Python lists
    column_selectors: list of column names to explode together
    returns: a new dataframe (same column order as ``df``, so we keep track
        of the ordering of the columns) where row i of each listed column
        holds that list's i-th element; shorter lists are padded with ''
        and the remaining columns are duplicated across rows.
    """
    def _split_list_to_rows(row, row_accumulator, column_selectors):
        split_rows = {}
        max_split = 0
        for column_selector in column_selectors:
            # BUG FIX: copy the list. The original aliased the DataFrame's
            # own list object, so the .pop(0) below emptied the caller's
            # cells in place.
            split_row = list(row[column_selector])
            split_rows[column_selector] = split_row
            if len(split_row) > max_split:
                max_split = len(split_row)
        for i in range(max_split):
            new_row = row.to_dict()
            for column_selector in column_selectors:
                try:
                    new_row[column_selector] = split_rows[column_selector].pop(0)
                except IndexError:
                    new_row[column_selector] = ''
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(_split_list_to_rows, axis=1, args=(new_rows, column_selectors))
    new_df = pd.DataFrame(new_rows, columns=df.columns)
    return new_df
Try this too:
def flatten_data(json=None):
    """Repeatedly normalize list-valued columns of record-oriented JSON data.

    json: records accepted by ``pd.DataFrame`` (list of dicts, dict of lists).
        The parameter name shadows the stdlib ``json`` module inside this
        function; it is kept for backward compatibility with keyword callers.
    returns: a dataframe where every list column has been exploded via
        ``pd.json_normalize``, its new columns prefixed with ``<col>_``.

    NOTE(review): list columns are detected by inspecting row 0 only, so all
    rows are assumed to share one shape — confirm against the data source.
    """
    df = pd.DataFrame(json)
    list_cols = [c for c in df.columns if isinstance(df.loc[0, c], list)]
    for i, col in enumerate(list_cols):
        # Non-list columns (plus list columns not yet exploded) ride along as metadata.
        meta_cols = [c for c in df.columns if not isinstance(df.loc[0, c], list)] + list_cols[i + 1:]
        json_data = df.to_dict('records')
        # pd.json_normalize replaces the deprecated bare json_normalize import.
        df = pd.json_normalize(data=json_data, record_path=col, meta=meta_cols,
                               record_prefix=col + '_', sep='_')
    return pd.json_normalize(df.to_dict('records'))
ENJOY..!!!
Was helpful. Thanks.
Works just fine! Thank you.
Can someone help me with the code for the below problem:
I have multiple columns with more than 1 value separated by delimiter. I need to create separate rows for those columns such that each value in the column will become a new row keeping the other values same.
I have attached the input and expected output in the excel sheet.
Literally just made a github account right now so I could say thank you.
Thank you!
EXACTLY what I was looking for and worked like a charm.
Big thanks, worked really well, much faster and cleaner than 1-st link solutions from Google.
So I'm trying to use this code, but when I call the function and insert my delimiter (a semicolon) into the function, I get an invalid syntax error. I might be doing something wrong. Sorry, I'm new to programming.
splitDataFrameList(df, alert_rule, ;)
Thanks so much, using this saved me a ton of time!
Thank you, I modified it a little bit to accommodate multiple delimiters by generating a regex pattern.
Like -
delimiters = ",","|"
Import re module for this
import re
def splitDataFrameList(df, target_column, delimiters):
    '''Split one column on any of several delimiters, one row per fragment.

    df = dataframe to split
    target_column = the column containing the values to split
    delimiters = iterable of delimiter strings; they are escaped and OR-ed
        together into a single regex pattern
    returns: a dataframe with each entry for the target column separated,
        with each element moved into a new row. The values in the other
        columns are duplicated across the newly divided rows.
    '''
    # Escape each delimiter so regex metacharacters like '|' split literally.
    regexPattern = "|".join(map(re.escape, delimiters))

    def splitListToRows(row, row_accumulator, target_column, regexPattern):
        for fragment in re.split(regexPattern, row[target_column]):
            record = row.to_dict()
            record[target_column] = fragment
            row_accumulator.append(record)

    accumulated = []
    df.apply(splitListToRows, axis=1, args=(accumulated, target_column, regexPattern))
    return pd.DataFrame(accumulated)
Hey I made it so it can accept multiple columns and try to split on all of them at the same time
def split_dataframe_rows(df, column_selectors, row_delimiter):
    """Split delimited strings in several columns into rows at the same time.

    df: dataframe to split
    column_selectors: list of column names whose string values are split
    row_delimiter: delimiter used for every listed column
    returns: a new dataframe (same column order as ``df``, so we keep track
        of the ordering of the columns) where columns with fewer fragments
        are padded with '' and the other columns are duplicated across rows.
    """
    # we need to keep track of the ordering of the columns
    def _split_list_to_rows(row, row_accumulator, column_selectors, row_delimiter):
        split_rows = {}
        max_split = 0
        for column_selector in column_selectors:
            split_row = row[column_selector].split(row_delimiter)
            split_rows[column_selector] = split_row
            if len(split_row) > max_split:
                max_split = len(split_row)
        for i in range(max_split):
            new_row = row.to_dict()
            for column_selector in column_selectors:
                try:
                    new_row[column_selector] = split_rows[column_selector].pop(0)
                except IndexError:
                    new_row[column_selector] = ''
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(_split_list_to_rows, axis=1, args=(new_rows, column_selectors, row_delimiter))
    new_df = pd.DataFrame(new_rows, columns=df.columns)
    return new_df
into
Thanks bro .. worked like a charm..
Worked wonders, Thank you so much :D
Thank you, I modified it a little bit to accommodate multiple delimiters by generating a regex pattern.
Like -
delimiters = ",","|"
Import re module for this
import re
def splitDataFrameList(df, target_column, delimiters):
    '''Split one column on any of several delimiters, one row per fragment.

    df = dataframe to split
    target_column = the column containing the values to split
    delimiters = iterable of delimiter strings, escaped and OR-ed into one
        regex pattern
    returns: a dataframe with each entry for the target column separated,
        with each element moved into a new row. The values in the other
        columns are duplicated across the newly divided rows.
    '''
    regexPattern = "|".join(map(re.escape, delimiters))
    def splitListToRows(row, row_accumulator, target_column, regexPattern):
        split_row = re.split(regexPattern, row[target_column])
        for s in split_row:
            new_row = row.to_dict()
            new_row[target_column] = s
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(splitListToRows, axis=1, args=(new_rows, target_column, regexPattern))
    new_df = pd.DataFrame(new_rows)
    return new_df
I have a dataframe that contains fieldnames and field content. There is no specific delimiter, but the fieldnames are limitative.
Have a somewhat clumsy way
https://stackoverflow.com/questions/60773067/split-string-no-delimiter-with-limitative-field-names-and-content
But this is far from optimal. Any advice?
What a help! very handy, saved time and worked like a magic!! Thank you!
Exactly what I was looking for.
Hey I made it so it can accept multiple columns and try to split on all of them at the same time
def split_dataframe_rows(df, column_selectors, row_delimiter):
    """Split delimited strings in several columns into rows at the same time.

    df: dataframe to split
    column_selectors: list of column names whose string values are split
    row_delimiter: delimiter used for every listed column
    returns: a new dataframe (same column order as ``df``, so we keep track
        of the ordering of the columns) where columns with fewer fragments
        are padded with '' and the other columns are duplicated across rows.
    """
    # we need to keep track of the ordering of the columns
    def _split_list_to_rows(row, row_accumulator, column_selectors, row_delimiter):
        split_rows = {}
        max_split = 0
        for column_selector in column_selectors:
            split_row = row[column_selector].split(row_delimiter)
            split_rows[column_selector] = split_row
            if len(split_row) > max_split:
                max_split = len(split_row)
        for i in range(max_split):
            new_row = row.to_dict()
            for column_selector in column_selectors:
                try:
                    new_row[column_selector] = split_rows[column_selector].pop(0)
                except IndexError:
                    new_row[column_selector] = ''
            row_accumulator.append(new_row)
    new_rows = []
    df.apply(_split_list_to_rows, axis=1, args=(new_rows, column_selectors, row_delimiter))
    new_df = pd.DataFrame(new_rows, columns=df.columns)
    return new_df
into
Works great, exactly what I was looking for. Thanks!
Very useful. Thanks
Thanks A lot For this code
Thanks! Very useful!
You could use 'pd.' instead of 'pandas.' :)
From pandas 0.25 on, one can use explode
Thank you !
very helpful!
Need some help. I am unable to run script provided by @namanjh and others above. I am using the same input data file separated by commas. Script errors due to new_df not being defined. Please help. Thanks.
Input data:
Contact | email2 | phone | notes | |
---|---|---|---|---|
adam,bob | adam.con, bob.com, john.com | adam.com2, bob.com2, john.com2 | adamphone, bobphone, johnphone | should be same for everyone |
other contact | don’t touch this | asdf | asdf | don’t touch this |
rachael, simone, snake | rachael.com, simone.com | rachael.com2, simone.com2, snake.com2 | rachaelphone, simonephone | should be same for everyone |
other contact | don’t touch this | asdf | asdf | don’t touch this |
Script:
#import numpy as np
import pandas as pd
from IPython.display import display
# Load the workbook. NOTE(review): pd.read_excel already returns a
# DataFrame, so the extra pd.DataFrame(...) wrapper is redundant.
df = pd.DataFrame(pd.read_excel("file_path.xlsx"))
# list(df) yields the column names, i.e. split on every column of the sheet.
column_selectors = list(df)
# Single delimiter used for every column.
row_delimiters = ','
#new_df = []
# Show the raw frame and the columns being targeted before splitting.
display(df)
display(column_selectors)
def split_dataframe_rows(df, column_selectors, row_delimiters):
    """Split delimited strings in several columns into rows simultaneously.

    df: dataframe to split
    column_selectors: list of column names whose string values are split
    row_delimiters: the delimiter string used for every listed column
    returns: a new dataframe (same column order as ``df``) where columns with
        fewer fragments are padded with '' and other columns are duplicated.
    """
    def _split_list_to_rows(row, row_accumulator, column_selectors, row_delimiter):
        split_rows = {}
        max_split = 0
        for column_selector in column_selectors:
            split_row = row[column_selector].split(row_delimiter)
            split_rows[column_selector] = split_row
            if len(split_row) > max_split:
                max_split = len(split_row)
        for i in range(max_split):
            new_row = row.to_dict()
            for column_selector in column_selectors:
                try:
                    new_row[column_selector] = split_rows[column_selector].pop(0)
                except IndexError:
                    new_row[column_selector] = ''
            row_accumulator.append(new_row)
    new_rows = []
    # BUG FIX: the original passed the undefined name ``row_delimiter`` here
    # (the parameter is ``row_delimiters``), raising NameError on first call.
    df.apply(_split_list_to_rows, axis=1, args=(new_rows, column_selectors, row_delimiters))
    # BUG FIX: the DataFrame keyword is ``columns``, not ``column``.
    new_df = pd.DataFrame(new_rows, columns=df.columns)
    return new_df
# BUG FIX: ``new_df`` only exists inside split_dataframe_rows (this was the
# reported "new_df not defined" error). Call the function to get the result.
df2 = split_dataframe_rows(df, column_selectors, row_delimiters)
display(df2)
Super excited, this is my first time commenting with something I made, be gentle. I needed something similar when a number and a string were in one column separated by a comma (blanks as well). I modified my code a bit to hopefully work a little more universally.
The code checks how many times a delimiter is used in each column row, then repeats that line for each one.
import pandas as pd
from itertools import chain
import numpy as np
def chainer(df, col, sep, dtype):
    """Explode delimiter-separated values in *col* into one row per value.

    df: input dataframe
    col: name of the column holding sep-joined values
    sep: delimiter to split on
    dtype: dtype *col* is cast to first (e.g. 'str', so NaN/floats split safely)
    returns: a new dataframe where each fragment of *col* gets its own row
        and every other column's value is repeated to match.
    """
    df = df.astype({col: dtype})
    # How many fragments each row produces — other columns repeat this often.
    counts = df[col].str.split(sep).map(len)
    exploded = {}
    for name in df.columns:
        if name == col:
            exploded[name] = list(chain.from_iterable(df[name].str.split(sep)))
        else:
            exploded[name] = np.repeat(df[name], counts)
    return pd.DataFrame.from_dict(exploded)
df = chainer(df,'Combined Column',',','str')
Added the astype because my column wouldn't convert a float or NaN, after using str worked like a champ.
This variation might be a bit faster.
Example: