Created
September 12, 2024 15:49
-
-
Save alvarobartt/f6a762b5c2e915475878939d39b0e9c7 to your computer and use it in GitHub Desktop.
DuckDB SQL query to datasets.Dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import duckdb
from datasets import Dataset

# Reshape each conversation turn into a JSON object with 'role' and
# 'content' keys, renaming the 'human' speaker (case-insensitively) to 'user'
# and leaving every other speaker label untouched. Only the first 10 rows of
# the remote JSON file are read.
query = """
SELECT
    id,
    ARRAY_TRANSFORM(
        conversations,
        x -> JSON({
            'role': CASE
                WHEN LOWER(x['from']) = 'human' THEN 'user'
                ELSE x['from']
            END,
            'content': x['value']
        })
    ) AS messages
FROM 'hf://datasets/NousResearch/hermes-function-calling-v1/func-calling-singleturn.json'
LIMIT 10;
"""

# Scope the connection with a context manager so it is closed even when the
# query fails (e.g. no network access to the `hf://` path).
with duckdb.connect() as con:
    # The httpfs extension is installed and loaded before the remote
    # `hf://` path is queried.
    con.execute("INSTALL httpfs;")
    con.execute("LOAD httpfs;")

    # Execute the query
    result = con.execute(query)

    # Materialize the full result as a pyarrow.Table before the connection
    # is closed; `fetch_arrow_table()` is explicit about returning a Table
    # rather than a streaming reader.
    table = result.fetch_arrow_table()

# Wrap the Arrow table in a Dataset
dataset = Dataset(table)
# Dataset({
#     features: ['id', 'messages'],
#     num_rows: 10  # bounded by the LIMIT 10 in the query
# })
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment