GDELT v2 processing (gist by @SultanOrazbayev)
from re import RegexFlag, compile
from typing import Literal
from pandas import DataFrame, concat, read_csv

# The GDELT v2 master file list enumerates every 15-minute update file
# (one line per file: size, hash, URL).
MASTER_LIST_URL = "http://data.gdeltproject.org/gdeltv2/masterfilelist.txt"
master_list = read_csv(MASTER_LIST_URL, sep=" ", names=["size", "hash", "url"])

# Parse the timestamp and dataset name ("export", "mentions", or "gkg") out of
# each update-file URL of the form .../gdeltv2/YYYYMMDDHHMMSS.<dataset>.CSV.zip.
pattern = compile(
    r"http://data\.gdeltproject\.org/gdeltv2/(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})(?P<hour>\d{2})(?P<minute>\d{2})(?P<second>\d{2})\.(?P<dataset>\w+)\.CSV\.zip",
    flags=RegexFlag.IGNORECASE,
)
master_list[["year", "month", "day", "hour", "minute", "second", "dataset"]] = master_list["url"].str.extract(pat=pattern)

# Build a "YYYY-MM-DD" date string for convenient per-day filtering.
master_list["date"] = master_list[["year", "month", "day"]].apply(lambda x: "-".join(str(_) for _ in x), axis=1)

# Column names for the event ("export") table.
columns_export = [
    "GLOBALEVENTID",
    "SQLDATE",
    "MonthYear",
    "Year",
    "FractionDate",
    "Actor1Code",
    "Actor1Name",
    "Actor1CountryCode",
    "Actor1KnownGroupCode",
    "Actor1EthnicCode",
    "Actor1Religion1Code",
    "Actor1Religion2Code",
    "Actor1Type1Code",
    "Actor1Type2Code",
    "Actor1Type3Code",
    "Actor2Code",
    "Actor2Name",
    "Actor2CountryCode",
    "Actor2KnownGroupCode",
    "Actor2EthnicCode",
    "Actor2Religion1Code",
    "Actor2Religion2Code",
    "Actor2Type1Code",
    "Actor2Type2Code",
    "Actor2Type3Code",
    "IsRootEvent",
    "EventCode",
    "EventBaseCode",
    "EventRootCode",
    "QuadClass",
    "GoldsteinScale",
    "NumMentions",
    "NumSources",
    "NumArticles",
    "AvgTone",
    "Actor1Geo_Type",
    "Actor1Geo_FullName",
    "Actor1Geo_CountryCode",
    "Actor1Geo_ADM1Code",
    "Actor1Geo_ADM2Code",
    "Actor1Geo_Lat",
    "Actor1Geo_Long",
    "Actor1Geo_FeatureID",
    "Actor2Geo_Type",
    "Actor2Geo_FullName",
    "Actor2Geo_CountryCode",
    "Actor2Geo_ADM1Code",
    "Actor2Geo_ADM2Code",
    "Actor2Geo_Lat",
    "Actor2Geo_Long",
    "Actor2Geo_FeatureID",
    "ActionGeo_Type",
    "ActionGeo_FullName",
    "ActionGeo_CountryCode",
    "ActionGeo_ADM1Code",
    "ActionGeo_ADM2Code",
    "ActionGeo_Lat",
    "ActionGeo_Long",
    "ActionGeo_FeatureID",
    "DATEADDED",
    "SOURCEURL",
]

# Column names for the mentions table.
columns_mentions = [
    "GLOBALEVENTID",
    "EventTimeDate",
    "MentionTimeDate",
    "MentionType",
    "MentionSourceName",
    "MentionIdentifier",
    "SentenceID",
    "Actor1CharOffset",
    "Actor2CharOffset",
    "ActionCharOffset",
    "InRawText",
    "Confidence",
    "MentionDocLen",
    "MentionDocTone",
    "MentionDocTranslationInfo",
    "Extras",
]

# Column names for the Global Knowledge Graph ("gkg") table.
columns_gkg = [
    "GKGRECORDID",
    "DATE",
    "SourceCollectionIdentifier",
    "SourceCommonName",
    "DocumentIdentifier",
    "Counts",
    "V2Counts",
    "Themes",
    "V2Themes",
    "Locations",
    "V2Locations",
    "Persons",
    "V2Persons",
    "Organizations",
    "V2Organizations",
    "V2Tone",
    "Dates",
    "GCAM",
    "SharingImage",
    "RelatedImageEmbeds",
    "SocialImageEmbeds",
    "SocialVideoEmbeds",
    "Quotations",
    "AllNames",
    "Amounts",
    "TranslationInfo",
    "Extras",
]


def download_data(date: str = "2024-08-27", dataset: Literal["export", "mentions", "gkg"] = "export") -> DataFrame:
    """Download all files for the given date and dataset and return them as one DataFrame."""
    dfs = []
    # Look up the matching column-name list defined above
    # (columns_export, columns_mentions, or columns_gkg).
    column_names = globals()[f"columns_{dataset}"]
    for _, row in master_list.query("(date==@date) & (dataset==@dataset)").iterrows():
        # Each URL points to a zipped, tab-separated, headerless CSV file;
        # pandas infers the zip compression from the ".zip" extension.
        df = read_csv(row.url, sep="\t", names=column_names)
        dfs.append(df)
    return concat(dfs)
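

# A minimal usage sketch: assumes network access and that the chosen date is
# covered by the master file list (a full day is roughly 96 update files, so
# this can take a while and use a fair amount of memory).
if __name__ == "__main__":
    events = download_data(date="2024-08-27", dataset="export")
    print(events[["GLOBALEVENTID", "SQLDATE", "EventCode", "AvgTone"]].head())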