Last active
June 30, 2025 03:29
-
-
Save pszemraj/a5308bd21b372129307514ee26d49f40 to your computer and use it in GitHub Desktop.
util script for loading, basic processing, converting reddit posts -> hf dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| util script for loading, basic processing, converting reddit posts -> hf dataset | |
| https://arctic-shift.photon-reddit.com/download-tool | |
| """ | |
| import pandas as pd | |
| from datasets import Dataset, load_dataset | |
src = "./r_LocalLLaMA_posts.jsonl"  # update with relevant path

# The export is read as JSON Lines (one record per line); convert_dtypes()
# then switches each column to the best-fitting pandas nullable dtype.
df = pd.read_json(src, lines=True).convert_dtypes()

# by default wayyyy too many columns are included: keep only the columns
# whose name contains no underscore
cols_mini = [col for col in df.columns if "_" not in col]
dfm = df[cols_mini]
dfm.info()


def _stringify_nested(value):
    """Return ``str(value)`` for lists/dicts; pass every other value through."""
    return str(value) if isinstance(value, (list, dict)) else value


# Lists and dicts are unhashable, so they must be turned into strings before
# nunique() can count distinct values. DataFrame.applymap is deprecated since
# pandas 2.1 in favour of DataFrame.map; fall back to it on older versions.
_elementwise = dfm.map if hasattr(pd.DataFrame, "map") else dfm.applymap
dfm_converted = _elementwise(_stringify_nested)

# Drop columns that have only one unique value. All-null columns have zero
# non-null unique values, so the same mask removes them as well.
_informative = dfm_converted.nunique(dropna=True) > 1
dfm_filtered = dfm_converted.loc[:, _informative]
dfm_filtered.info()

# Reorder the DataFrame columns: preferred columns first, the rest after.
# A preferred column may have been removed by the filter above (or be absent
# from the export), so only the ones still present are used.
first_columns = ["title", "score", "selftext", "created", "url", "author"]
_leading_columns = [col for col in first_columns if col in dfm_filtered.columns]
remaining_columns = [col for col in dfm_filtered.columns if col not in first_columns]
# .copy() so the column assignments below write to an independent frame
# instead of a selection of dfm_filtered (avoids SettingWithCopyWarning).
dfm_reordered = dfm_filtered[_leading_columns + remaining_columns].copy()

# Convert Unix timestamps to datetime; values that cannot be parsed as epoch
# seconds become NaT (errors="coerce").
for _ts_col in ("created", "edited"):
    if _ts_col in dfm_reordered.columns:
        dfm_reordered[_ts_col] = pd.to_datetime(
            dfm_reordered[_ts_col], unit="s", errors="coerce"
        )

ds = Dataset.from_pandas(dfm_reordered, preserve_index=False)
print(ds)
# Optional: uncomment to publish the dataset to the Hugging Face Hub.
# ds.push_to_hub("pszemraj/LocalLLaMA-posts")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment