Created
January 26, 2023 05:45
-
-
Save ncoop57/f700ea0980ed6d19673440ac19e5c457 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Spot-check which source datasets appear in a slice of codepile parquet files.

Lists every object under ``KEY_PREFIX`` in the S3 bucket, reads a fixed
slice of those files into a Spark DataFrame, and shows the rows whose
``meta`` column contains each of the markers in ``SOURCE_MARKERS``.
"""
import boto3

from spark_session_builder import build_spark_session

BUCKET_NAME = "s-eai-neox"
KEY_PREFIX = "data/codepile/group1/"
SPARK_MASTER = "spark://cpu64-dy-r6i-16xlarge-9:7077"

# Substrings looked for in the ``meta`` column, one ``show()`` per marker.
SOURCE_MARKERS = (
    "Project Gutenberg",
    "Ubuntu IRC",
    "USPTO-Application",
    "S2ORC",
)

s3 = boto3.resource("s3")
my_bucket = s3.Bucket(BUCKET_NAME)

# Spark reads through the s3a:// scheme, so build fully-qualified URIs.
file_paths = [
    f"s3a://{BUCKET_NAME}/{bucket_object.key}"
    for bucket_object in my_bucket.objects.filter(Prefix=KEY_PREFIX)
]
print(len(file_paths))

# Only inspect the second hundred files rather than the full listing.
file_paths = file_paths[100:200]
if not file_paths:
    # Fail early with a clear message instead of asking Spark to read nothing.
    raise SystemExit(
        f"No files in slice [100:200] under s3://{BUCKET_NAME}/{KEY_PREFIX}"
    )

# NOTE(review): 32 and 256 are presumably core count and memory (GB) -- confirm
# against spark_session_builder.build_spark_session.
spark = build_spark_session(SPARK_MASTER, 32, 256)
data = spark.read.parquet(*file_paths)
data.show()

# data.filter(data.meta.contains("arXiv_out")).show(truncate=False)
for marker in SOURCE_MARKERS:
    data.filter(data.meta.contains(marker)).show()

# Reference: predicates used elsewhere to identify each source dataset.
# "PubMed_ver2" : lambda example : ast.literal_eval(example["meta"]["source"]) == "PubMedDataset",
# "Gutenberg_ver2" : lambda example : ast.literal_eval(example["meta"]["source"]) == "Project Gutenberg",
# "FreeLaw_Options_ver2" : lambda example : "date_created" in example.keys(),
# "UbuntuIRC_ver2" : lambda example : ast.literal_eval(example["meta"]["source"]) == "Ubuntu IRC",
# "Enwiki_ver2" : lambda example : "wikidata_id" in example.keys(),
# "EuroParliamentProceedings_ver2" : lambda example : "language" in example.keys(),
# "USPTO_ver2" : lambda example : ast.literal_eval(example["meta"]["source_data"]) == "USPTO-Application",
# "PileOfLaw_ver2" : lambda example : "dataset" in example.keys(),
# "OtherWiki_ver2" : lambda example : "wiki_source" in example.keys(),
# "S2ORC_ver2" : lambda example : ast.literal_eval(example["meta"]["source"]) == "S2ORC",
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment