Skip to content

Instantly share code, notes, and snippets.

-- s1 is not used
WITH s1 AS (
SELECT
linkedin_slug
, count(*) OVER (PARTITION BY 1)
, max(updated_at) max_updated_at
FROM person
GROUP BY linkedin_slug
HAVING count(*) > 1
)
@xoelop
xoelop / ingest_data_to_tb_deduplicate.sh
Last active April 23, 2023 19:43
Deduplicating rows on Tinybird almost on ingestion time
# full process. Ingests data from postgres to tb, calculates duplicates, inserts them to a new datasource and removes the rows from that datasource that appear in the original one.
source $(pwd)/.env
# related, to ingest data from postgres: https://blog.tinybird.co/2019/10/14/the-one-cron-job-that-will-speed-up-your-analytical-queries-in-postgres-a-hundred-fold/
echo 'Ingesting most recently update jobs'
psql $HEROKU_POSTGRES_URL -c "COPY (SELECT id, url, job_title, company, description, description_cleaned, date_posted, now() FROM job WHERE COALESCE(description, '') <> '' AND updated_at > now() - interval '70 minutes') TO STDOUT WITH (FORMAT CSV)" | curl -F csv=@- "https://api.tinybird.co/v0/datasources?name=jobs&mode=append&token=$TINYBIRD_ADMIN_TOKEN";
@xoelop
xoelop / convert_to_csv_middleware.py
Last active January 17, 2022 16:53
A middleware that transforms the JSON output from a FastAPI endpoint into CSV content
import json
import io
from fastapi.responses import StreamingResponse
from fastapi import Request, Response
from fastapi import FastAPI
app = FastAPI()
@xoelop
xoelop / Stock charts with dynamic tooltip.json
Created May 10, 2021 12:21
Stock charts with dynamic tooltip
{
"$schema": "https://vega.github.io/schema/vega-lite/v5.json",
"data": {"url": "data/stocks.csv"},
"width": 400,
"height": 300,
"encoding": {"x": {"field": "date", "type": "temporal"}},
"layer": [
{
"encoding": {
"color": {"field": "symbol", "type": "nominal"},
@xoelop
xoelop / create_events_table_postgres_inserts_data.sql
Created March 17, 2021 10:33
Creates a postgres db locally, runs postgres server and inserts events data
DROP TABLE IF EXISTS events;
CREATE TABLE events
(
date timestamp,
product_id text,
user_id bigint,
event text,
extra_data json
);
@xoelop
xoelop / regexp_quote.sql
Last active March 10, 2021 13:23
Function to escape special characters in a pattern in Postgres. This is useful if you want to match an arbitrary literal string that may have regular expression metacharacters in it
CREATE OR REPLACE FUNCTION public.regexp_quote(IN TEXT)
RETURNS TEXT
LANGUAGE plpgsql
STABLE
AS $$
/*******************************************************************************
* Function Name: regexp_quote
* In-coming Param:
* The string to decoded and convert into a set of text arrays.
* Returns:
# loads the result of a query on Tinybird to a Pandas DataFrame
import pandas as pd
import urllib.parse
def build_tinybird_query_url(query: str, token: str) -> str:
url = f"https://api.tinybird.co/v0/sql?q={urllib.parse.quote(query + ' FORMAT CSVWithNames')}&token={token}"
return url
import requests
import urllib.parse
import json
def jprint(data: str):
"""Prints JSON-like string (data) nicely"""
print(json.dumps(data, indent=4, ensure_ascii=False))
def ingest_data(datasource: str,
SELECT
date,
link,
tweet_text,
ifNull(url, '') url,
ifNull(title, '') title,
ifNull(description, '') description,
media,
format('https://twitter.com/{}', screenname) user_link,
format('{} (@{})', username, screenname) user_name_handle,
%
SELECT * FROM all_likes
WHERE match(lower(text), replaceRegexpAll({{String(search_term, ' ', description="Return tweets whose text matches this word (can include a regex pattern)", required=True)}}, ' ', '.?'))
AND (
multiSearchAnyCaseInsensitive(screenname, splitByString(',', {{String(usernames, ',', description="User names or handles, comma-separated")}}))
OR multiSearchAnyCaseInsensitive(username, splitByString(',', {{String(usernames, ',', description="User names or handles, comma-separated")}}))
)
AND date >= toStartOfDay(toDate({{Date(since, '2000-09-09', description="Start date, YYYY-MM-DD")}}))
AND date <= addDays(toStartOfDay(toDate({{Date(until, '2100-09-09', description="End date, YYYY-MM-DD")}})), 1)
ORDER BY date desc