Created
September 21, 2013 18:47
-
-
Save jseabold/6653108 to your computer and use it in GitHub Desktop.
Go from wide format panel data to long format. Similar to Stata's reshape.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def wide_to_long(df, stubnames, i, j): | |
| """ | |
| User-friendly wide panel to long format. | |
| Parameters | |
| ---------- | |
| df : DataFrame | |
| The wide-format DataFrame | |
| stubnames : list | |
| A list of stub names. The wide format variables are assumed to | |
| start with the stub names. | |
| i : str | |
| The name of the id variable. | |
| j : str | |
| The name of the subobservation variable. | |
| Returns | |
| ------- | |
| DataFrame | |
| A DataFrame that contains each stub name as a variable as well as | |
| variables for i and j. | |
| Examples | |
| -------- | |
| import pandas as pd | |
| import numpy as np | |
| np.random.seed(123) | |
| df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"}, | |
| "A1980" : {0 : "d", 1 : "e", 2 : "f"}, | |
| "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7}, | |
| "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1}, | |
| "X" : dict(zip(range(3), np.random.randn(3))) | |
| }) | |
| df["id"] = df.index | |
| wide_to_long(df, ["A", "B"], i="id", j="year") | |
| Notes | |
| ----- | |
| All extra variables are treated as extra id variables. | |
| """ | |
| def get_var_names(df, regex): | |
| return df.filter(regex=regex).columns.tolist() | |
| def melt_stub(df, stub, i, j): | |
| varnames = get_var_names(df, "^"+stub) | |
| newdf = pd.melt(df, id_vars=i, value_vars=varnames, | |
| value_name=stub, var_name=j) | |
| newdf[j] = newdf[j].str.replace(stub, "").astype(int) | |
| return newdf | |
| id_vars = get_var_names(df, "^(?!%s)" % "|".join(stubnames)) | |
| if i not in id_vars: | |
| id_vars += [i] | |
| stub = stubnames.pop(0) | |
| newdf = melt_stub(df, stub, id_vars, j) | |
| for stub in stubnames: | |
| new = melt_stub(df, stub, id_vars, j) | |
| newdf = newdf.merge(new, how="outer", on=id_vars + [j], copy=False) | |
| return newdf.set_index([i, j]) | |
| if __name__ == "__main__": | |
| url = "https://gist.github.com/jseabold/6652233/raw/44a04a78be85f5b3af16f26f51a143e047a10972/patr7079.csv" | |
| dta = pd.read_csv(url) | |
| dta["id"] = dta.index | |
| long_dta = wide_to_long(dta, ["LOGR", "PAT"], i="id", j="year") | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment