SultanOrazbayev · September 18, 2024 03:40
diff --git a/gistfile1.txt b/gistfile1.txt
 from re import RegexFlag, compile
 from typing import Literal

 from pandas import DataFrame, concat, read_csv

 # URL of the GDELT v2 master file list.
 MASTER_LIST_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"

 # NOTE: this download happens at import time. Every line of the list is read
 # as three space-separated fields, named size, hash and url.
 master_list = read_csv(MASTER_LIST_URL, sep=" ", names=["size", "hash", "url"])

 # Each archive URL carries a 14-digit YYYYMMDDHHMMSS stamp followed by the
 # dataset name; the match is case-insensitive so ".CSV.zip" and ".csv.zip"
 # are both accepted.
 pattern = compile(
    r"http://data\.gdeltproject\.org/gdeltv2/(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})(?P<hour>\d{2})(?P<minute>\d{2})(?P<second>\d{2})\.(?P<dataset>\w+)\.CSV\.zip",
    flags=RegexFlag.IGNORECASE,
 )
 master_list[["year", "month", "day", "hour", "minute", "second", "dataset"]] = master_list["url"].str.extract(pat=pattern)
 # Build "YYYY-MM-DD" with one vectorised string concatenation instead of a
 # row-wise apply. Rows whose URL did not match the pattern keep a missing
 # date rather than the literal text "nan-nan-nan".
 master_list["date"] = master_list["year"].str.cat([master_list["month"], master_list["day"]], sep="-")


 # Column names, in positional order, that download_data() hands to
 # read_csv(names=...) when dataset == "export".
 columns_export = (
    # Event id and date fields.
    ["GLOBALEVENTID", "SQLDATE", "MonthYear", "Year", "FractionDate"]
    # Actor1* fields.
    + [
        "Actor1Code",
        "Actor1Name",
        "Actor1CountryCode",
        "Actor1KnownGroupCode",
        "Actor1EthnicCode",
        "Actor1Religion1Code",
        "Actor1Religion2Code",
        "Actor1Type1Code",
        "Actor1Type2Code",
        "Actor1Type3Code",
    ]
    # Actor2* fields (same suffixes as Actor1*).
    + [
        "Actor2Code",
        "Actor2Name",
        "Actor2CountryCode",
        "Actor2KnownGroupCode",
        "Actor2EthnicCode",
        "Actor2Religion1Code",
        "Actor2Religion2Code",
        "Actor2Type1Code",
        "Actor2Type2Code",
        "Actor2Type3Code",
    ]
    # Event code and count/score fields.
    + [
        "IsRootEvent",
        "EventCode",
        "EventBaseCode",
        "EventRootCode",
        "QuadClass",
        "GoldsteinScale",
        "NumMentions",
        "NumSources",
        "NumArticles",
        "AvgTone",
    ]
    # Actor1Geo_* fields.
    + [
        "Actor1Geo_Type",
        "Actor1Geo_FullName",
        "Actor1Geo_CountryCode",
        "Actor1Geo_ADM1Code",
        "Actor1Geo_ADM2Code",
        "Actor1Geo_Lat",
        "Actor1Geo_Long",
        "Actor1Geo_FeatureID",
    ]
    # Actor2Geo_* fields.
    + [
        "Actor2Geo_Type",
        "Actor2Geo_FullName",
        "Actor2Geo_CountryCode",
        "Actor2Geo_ADM1Code",
        "Actor2Geo_ADM2Code",
        "Actor2Geo_Lat",
        "Actor2Geo_Long",
        "Actor2Geo_FeatureID",
    ]
    # ActionGeo_* fields.
    + [
        "ActionGeo_Type",
        "ActionGeo_FullName",
        "ActionGeo_CountryCode",
        "ActionGeo_ADM1Code",
        "ActionGeo_ADM2Code",
        "ActionGeo_Lat",
        "ActionGeo_Long",
        "ActionGeo_FeatureID",
    ]
    # Trailing fields.
    + ["DATEADDED", "SOURCEURL"]
 )


 # Column names, in positional order, that download_data() hands to
 # read_csv(names=...) when dataset == "mentions".
 columns_mentions = (
    # Event id and the two timestamp fields.
    ["GLOBALEVENTID", "EventTimeDate", "MentionTimeDate"]
    # Mention* source fields.
    + ["MentionType", "MentionSourceName", "MentionIdentifier"]
    # Sentence id and *CharOffset fields.
    + ["SentenceID", "Actor1CharOffset", "Actor2CharOffset", "ActionCharOffset"]
    + ["InRawText", "Confidence"]
    # MentionDoc* fields and the trailing extras field.
    + ["MentionDocLen", "MentionDocTone", "MentionDocTranslationInfo", "Extras"]
 )


 # Column names, in positional order, that download_data() hands to
 # read_csv(names=...) when dataset == "gkg".
 columns_gkg = (
    # Record id, date and source/document identifier fields.
    ["GKGRECORDID", "DATE", "SourceCollectionIdentifier", "SourceCommonName", "DocumentIdentifier"]
    # Paired fields: each plain name is followed by its "V2"-prefixed name.
    + [
        "Counts",
        "V2Counts",
        "Themes",
        "V2Themes",
        "Locations",
        "V2Locations",
        "Persons",
        "V2Persons",
        "Organizations",
        "V2Organizations",
    ]
    + ["V2Tone", "Dates", "GCAM"]
    # Image and video fields.
    + ["SharingImage", "RelatedImageEmbeds", "SocialImageEmbeds", "SocialVideoEmbeds"]
    # Remaining fields.
    + ["Quotations", "AllNames", "Amounts", "TranslationInfo", "Extras"]
 )


 def download_data(date: str = "2024-08-27", dataset: Literal["export", "mentions", "gkg"] = "export") -> DataFrame:
    """Download and concatenate every listed file of one dataset for one date.

    Args:
        date: Day to fetch, written as ``"YYYY-MM-DD"``. It is compared for
            equality with the ``date`` column of ``master_list``.
        dataset: Dataset name. It selects both the rows of ``master_list``
            and the module-level ``columns_<dataset>`` list used as header.

    Returns:
        The rows of all matching files stacked in master-list order. Row
        labels restart for every file because ``concat`` is called without
        ``ignore_index``.

    Raises:
        KeyError: If no module-level ``columns_<dataset>`` name exists.
        ValueError: If ``master_list`` has no row for this date and dataset.
    """
    column_names = globals()[f"columns_{dataset}"]
    urls = master_list.query("(date==@date) & (dataset==@dataset)")["url"]
    # Files are tab-separated; names= supplies the header positionally.
    frames = [read_csv(url, sep="\t", names=column_names) for url in urls]
    if not frames:
        # concat([]) would raise an opaque "No objects to concatenate";
        # keep the exception type but say what was actually missing.
        raise ValueError(f"no {dataset!r} files listed for date {date!r} (expected format YYYY-MM-DD)")
    return concat(frames)
	from re import RegexFlag, compile
	from typing import Literal

	from pandas import DataFrame, concat, read_csv

	# URL of the GDELT v2 master file list.
	MASTER_LIST_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"

	# NOTE: this download happens at import time. Every line of the list is read
	# as three space-separated fields, named size, hash and url.
	master_list = read_csv(MASTER_LIST_URL, sep=" ", names=["size", "hash", "url"])

	# Each archive URL carries a 14-digit YYYYMMDDHHMMSS stamp followed by the
	# dataset name; the match is case-insensitive so ".CSV.zip" and ".csv.zip"
	# are both accepted.
	pattern = compile(
	    r"http://data\.gdeltproject\.org/gdeltv2/(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})(?P<hour>\d{2})(?P<minute>\d{2})(?P<second>\d{2})\.(?P<dataset>\w+)\.CSV\.zip",
	    flags=RegexFlag.IGNORECASE,
	)
	master_list[["year", "month", "day", "hour", "minute", "second", "dataset"]] = master_list["url"].str.extract(pat=pattern)
	# Build "YYYY-MM-DD" with one vectorised string concatenation instead of a
	# row-wise apply. Rows whose URL did not match the pattern keep a missing
	# date rather than the literal text "nan-nan-nan".
	master_list["date"] = master_list["year"].str.cat([master_list["month"], master_list["day"]], sep="-")


	# Column names, in positional order, that download_data() hands to
	# read_csv(names=...) when dataset == "export".
	columns_export = (
	    # Event id and date fields.
	    ["GLOBALEVENTID", "SQLDATE", "MonthYear", "Year", "FractionDate"]
	    # Actor1* fields.
	    + [
	        "Actor1Code",
	        "Actor1Name",
	        "Actor1CountryCode",
	        "Actor1KnownGroupCode",
	        "Actor1EthnicCode",
	        "Actor1Religion1Code",
	        "Actor1Religion2Code",
	        "Actor1Type1Code",
	        "Actor1Type2Code",
	        "Actor1Type3Code",
	    ]
	    # Actor2* fields (same suffixes as Actor1*).
	    + [
	        "Actor2Code",
	        "Actor2Name",
	        "Actor2CountryCode",
	        "Actor2KnownGroupCode",
	        "Actor2EthnicCode",
	        "Actor2Religion1Code",
	        "Actor2Religion2Code",
	        "Actor2Type1Code",
	        "Actor2Type2Code",
	        "Actor2Type3Code",
	    ]
	    # Event code and count/score fields.
	    + [
	        "IsRootEvent",
	        "EventCode",
	        "EventBaseCode",
	        "EventRootCode",
	        "QuadClass",
	        "GoldsteinScale",
	        "NumMentions",
	        "NumSources",
	        "NumArticles",
	        "AvgTone",
	    ]
	    # Actor1Geo_* fields.
	    + [
	        "Actor1Geo_Type",
	        "Actor1Geo_FullName",
	        "Actor1Geo_CountryCode",
	        "Actor1Geo_ADM1Code",
	        "Actor1Geo_ADM2Code",
	        "Actor1Geo_Lat",
	        "Actor1Geo_Long",
	        "Actor1Geo_FeatureID",
	    ]
	    # Actor2Geo_* fields.
	    + [
	        "Actor2Geo_Type",
	        "Actor2Geo_FullName",
	        "Actor2Geo_CountryCode",
	        "Actor2Geo_ADM1Code",
	        "Actor2Geo_ADM2Code",
	        "Actor2Geo_Lat",
	        "Actor2Geo_Long",
	        "Actor2Geo_FeatureID",
	    ]
	    # ActionGeo_* fields.
	    + [
	        "ActionGeo_Type",
	        "ActionGeo_FullName",
	        "ActionGeo_CountryCode",
	        "ActionGeo_ADM1Code",
	        "ActionGeo_ADM2Code",
	        "ActionGeo_Lat",
	        "ActionGeo_Long",
	        "ActionGeo_FeatureID",
	    ]
	    # Trailing fields.
	    + ["DATEADDED", "SOURCEURL"]
	)


	# Column names, in positional order, that download_data() hands to
	# read_csv(names=...) when dataset == "mentions".
	columns_mentions = (
	    # Event id and the two timestamp fields.
	    ["GLOBALEVENTID", "EventTimeDate", "MentionTimeDate"]
	    # Mention* source fields.
	    + ["MentionType", "MentionSourceName", "MentionIdentifier"]
	    # Sentence id and *CharOffset fields.
	    + ["SentenceID", "Actor1CharOffset", "Actor2CharOffset", "ActionCharOffset"]
	    + ["InRawText", "Confidence"]
	    # MentionDoc* fields and the trailing extras field.
	    + ["MentionDocLen", "MentionDocTone", "MentionDocTranslationInfo", "Extras"]
	)


	# Column names, in positional order, that download_data() hands to
	# read_csv(names=...) when dataset == "gkg".
	columns_gkg = (
	    # Record id, date and source/document identifier fields.
	    ["GKGRECORDID", "DATE", "SourceCollectionIdentifier", "SourceCommonName", "DocumentIdentifier"]
	    # Paired fields: each plain name is followed by its "V2"-prefixed name.
	    + [
	        "Counts",
	        "V2Counts",
	        "Themes",
	        "V2Themes",
	        "Locations",
	        "V2Locations",
	        "Persons",
	        "V2Persons",
	        "Organizations",
	        "V2Organizations",
	    ]
	    + ["V2Tone", "Dates", "GCAM"]
	    # Image and video fields.
	    + ["SharingImage", "RelatedImageEmbeds", "SocialImageEmbeds", "SocialVideoEmbeds"]
	    # Remaining fields.
	    + ["Quotations", "AllNames", "Amounts", "TranslationInfo", "Extras"]
	)


	def download_data(date: str = "2024-08-27", dataset: Literal["export", "mentions", "gkg"] = "export") -> DataFrame:
	    """Download and concatenate every listed file of one dataset for one date.

	    Args:
	        date: Day to fetch, written as ``"YYYY-MM-DD"``. It is compared for
	            equality with the ``date`` column of ``master_list``.
	        dataset: Dataset name. It selects both the rows of ``master_list``
	            and the module-level ``columns_<dataset>`` list used as header.

	    Returns:
	        The rows of all matching files stacked in master-list order. Row
	        labels restart for every file because ``concat`` is called without
	        ``ignore_index``.

	    Raises:
	        KeyError: If no module-level ``columns_<dataset>`` name exists.
	        ValueError: If ``master_list`` has no row for this date and dataset.
	    """
	    column_names = globals()[f"columns_{dataset}"]
	    urls = master_list.query("(date==@date) & (dataset==@dataset)")["url"]
	    # Files are tab-separated; names= supplies the header positionally.
	    frames = [read_csv(url, sep="\t", names=column_names) for url in urls]
	    if not frames:
	        # concat([]) would raise an opaque "No objects to concatenate";
	        # keep the exception type but say what was actually missing.
	        raise ValueError(f"no {dataset!r} files listed for date {date!r} (expected format YYYY-MM-DD)")
	    return concat(frames)