Skip to content

Instantly share code, notes, and snippets.

@jseabold
Created September 21, 2013 18:47
Show Gist options
  • Select an option

  • Save jseabold/6653108 to your computer and use it in GitHub Desktop.

Select an option

Save jseabold/6653108 to your computer and use it in GitHub Desktop.
Go from wide format panel data to long format. Similar to Stata's reshape.
def wide_to_long(df, stubnames, i, j):
"""
User-friendly wide panel to long format.
Parameters
----------
df : DataFrame
The wide-format DataFrame
stubnames : list
A list of stub names. The wide format variables are assumed to
start with the stub names.
i : str
The name of the id variable.
j : str
The name of the subobservation variable.
Returns
-------
DataFrame
A DataFrame that contains each stub name as a variable as well as
variables for i and j.
Examples
--------
import pandas as pd
import numpy as np
np.random.seed(123)
df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
"A1980" : {0 : "d", 1 : "e", 2 : "f"},
"B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
"B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
"X" : dict(zip(range(3), np.random.randn(3)))
})
df["id"] = df.index
wide_to_long(df, ["A", "B"], i="id", j="year")
Notes
-----
All extra variables are treated as extra id variables.
"""
def get_var_names(df, regex):
return df.filter(regex=regex).columns.tolist()
def melt_stub(df, stub, i, j):
varnames = get_var_names(df, "^"+stub)
newdf = pd.melt(df, id_vars=i, value_vars=varnames,
value_name=stub, var_name=j)
newdf[j] = newdf[j].str.replace(stub, "").astype(int)
return newdf
id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames))
if i not in id_vars:
id_vars += [i]
stub = stubnames.pop(0)
newdf = melt_stub(df, stub, id_vars, j)
for stub in stubnames:
new = melt_stub(df, stub, id_vars, j)
newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False)
return newdf.set_index([i, j])
if __name__ == "__main__":
url = "https://gist.github.com/jseabold/6652233/raw/44a04a78be85f5b3af16f26f51a143e047a10972/patr7079.csv"
dta = pd.read_csv(url)
dta["id"] = dta.index
long_dta = wide_to_long(dta, ["LOGR", "PAT"], i="id", j="year")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment