Skip to content

Instantly share code, notes, and snippets.

@alvarobartt
Created September 12, 2024 15:49
Show Gist options
  • Save alvarobartt/f6a762b5c2e915475878939d39b0e9c7 to your computer and use it in GitHub Desktop.
Save alvarobartt/f6a762b5c2e915475878939d39b0e9c7 to your computer and use it in GitHub Desktop.
DuckDB SQL query to datasets.Dataset
import duckdb
from datasets import Dataset
# SQL run by DuckDB against the remote JSON file. For every element of
# `conversations` it builds a JSON object with:
#   - 'role': 'user' when the element's `from` is 'human' (compared
#     case-insensitively via LOWER); any other `from` value is kept unchanged
#   - 'content': the element's `value`
# The transformed list is exposed as the `messages` column next to `id`.
query = """
SELECT
id,
ARRAY_TRANSFORM(
conversations,
x -> JSON({
'role': CASE
WHEN LOWER(x['from']) = 'human' THEN 'user'
ELSE x['from']
END,
'content': x['value']
})
) AS messages
FROM 'hf://datasets/NousResearch/hermes-function-calling-v1/func-calling-singleturn.json'
LIMIT 10;
"""
# Open the DuckDB connection as a context manager so it is closed even if
# installing the extension or running the query raises.
with duckdb.connect() as con:
    # httpfs provides remote file access, which the hf:// path in the query
    # relies on.
    con.execute("INSTALL httpfs;")
    con.execute("LOAD httpfs;")
    # Execute the query
    result = con.execute(query)
    # Materialize the full result as an Arrow table while the connection is
    # still open; fetch_arrow_table() is explicit about returning a table
    # (rather than a streaming reader), which is what Dataset() is given below.
    table = result.fetch_arrow_table()
# Wrap the Arrow table in a datasets.Dataset
dataset = Dataset(table)
# Expected repr (the query's LIMIT 10 caps the result at 10 rows):
# Dataset({
# features: ['id', 'messages'],
# num_rows: 10
# })
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment