brockmanmatt · October 20, 2019 23:54
diff --git a/Format TimeSeries DataFrame.py b/Format TimeSeries DataFrame.py
 # Build a per-publisher, per-date table of naive country-mention counts.
 # Reads `df`, which must provide SOURCES, DATE and LOCATIONS columns
 # (SOURCES may hold several ';'-separated publishers per row).

 # Unnest the entries with multiple sources in them: every other column
 # becomes the index, SOURCES is split on ';' into one column per publisher,
 # and stack() turns those columns back into one row per publisher.
 original_columns = df.columns
 other_columns = original_columns.drop("SOURCES").tolist()
 split_sources = df.set_index(other_columns)["SOURCES"].str.split(";", expand=True)
 df = (
     split_sources.stack()
     .reset_index()
     .rename(columns={0: "SOURCES"})
     .loc[:, original_columns]  # restore column order, drop the stack level
 )

 # Convert DATE to datetimes. Values are stringified first so they are parsed
 # as text (presumably DATE is stored as an integer such as YYYYMMDD -- confirm).
 df["DATE"] = pd.to_datetime(df["DATE"].astype(str))
 df = df.fillna("")
 df = df.set_index("DATE")

 # Naive country mentions: flag rows whose LOCATIONS text contains the keyword.
 country_keywords = {
     "dprk": "North Korea",
     "ukraine": "Ukraine",
     "russia": "Russia",
     "iran": "Iran",
     "china": "China",
 }
 for column, keyword in country_keywords.items():
     # regex=False keeps this a plain substring test, like str.find(...) > -1.
     df[column] = df["LOCATIONS"].str.contains(keyword, regex=False, na=False)

 # Summing the boolean flags yields mention counts per (publisher, date).
 loc_df = df.groupby(["SOURCES", "DATE"])[list(country_keywords)].sum()

 mySources = ["nytimes.com", "washingtonpost.com", "foxnews.com", "cnn.com"]

 # Build the final DataFrame: one "<publisher>_<country>" column per pair,
 # aligned on DATE. `.loc` replaces the removed `.ix` indexer, and a single
 # concat avoids re-copying the growing frame on every iteration.
 time_series = pd.concat(
     [
         loc_df.loc[publisher].add_prefix("{}_".format(publisher))
         for publisher in mySources
     ],
     axis=1,
 )
	# Build a per-publisher, per-date table of naive country-mention counts.
	# Reads `df`, which must provide SOURCES, DATE and LOCATIONS columns
	# (SOURCES may hold several ';'-separated publishers per row).

	# Unnest the entries with multiple sources in them: every other column
	# becomes the index, SOURCES is split on ';' into one column per publisher,
	# and stack() turns those columns back into one row per publisher.
	original_columns = df.columns
	other_columns = original_columns.drop("SOURCES").tolist()
	split_sources = df.set_index(other_columns)["SOURCES"].str.split(";", expand=True)
	df = (
		split_sources.stack()
		.reset_index()
		.rename(columns={0: "SOURCES"})
		.loc[:, original_columns]  # restore column order, drop the stack level
	)

	# Convert DATE to datetimes. Values are stringified first so they are parsed
	# as text (presumably DATE is stored as an integer such as YYYYMMDD -- confirm).
	df["DATE"] = pd.to_datetime(df["DATE"].astype(str))
	df = df.fillna("")
	df = df.set_index("DATE")

	# Naive country mentions: flag rows whose LOCATIONS text contains the keyword.
	country_keywords = {
		"dprk": "North Korea",
		"ukraine": "Ukraine",
		"russia": "Russia",
		"iran": "Iran",
		"china": "China",
	}
	for column, keyword in country_keywords.items():
		# regex=False keeps this a plain substring test, like str.find(...) > -1.
		df[column] = df["LOCATIONS"].str.contains(keyword, regex=False, na=False)

	# Summing the boolean flags yields mention counts per (publisher, date).
	loc_df = df.groupby(["SOURCES", "DATE"])[list(country_keywords)].sum()

	mySources = ["nytimes.com", "washingtonpost.com", "foxnews.com", "cnn.com"]

	# Build the final DataFrame: one "<publisher>_<country>" column per pair,
	# aligned on DATE. `.loc` replaces the removed `.ix` indexer, and a single
	# concat avoids re-copying the growing frame on every iteration.
	time_series = pd.concat(
		[
			loc_df.loc[publisher].add_prefix("{}_".format(publisher))
			for publisher in mySources
		],
		axis=1,
	)
No results found