import pdfplumber
import PyPDF2

# Open the PDF file with PyPDF2
pdf_file = 'example.pdf'
pdf_reader = PyPDF2.PdfReader(pdf_file)

# Extract text with PyPDF2, page by page
full_text = ""
for page in pdf_reader.pages:
    full_text += page.extract_text() or ""
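
# The pdfplumber import above is otherwise unused; a minimal sketch of the
# equivalent extraction with pdfplumber (an assumed alternative, not part of
# the original snippet):
with pdfplumber.open(pdf_file) as pdf:
    plumber_text = "".join(page.extract_text() or "" for page in pdf.pages)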
# data_pipeline.py
from airflow import DAG
from airflow.providers.http.operators.http import HttpOperator
from airflow.operators.python import PythonOperator
import base64
from datetime import datetime, timedelta
import io
import zipfile
import pandas as pd
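
# The file stops at the imports; what follows is a hedged sketch of how they
# could be wired into a DAG. The connection id, endpoint, and file layout are
# assumptions. HttpOperator passes text through XCom, so the binary zip is
# base64-encoded in response_filter and decoded in the Python task — which
# would explain the base64/io/zipfile imports.
def _process_zip(ti):
    # Pull the base64-encoded zip from the upstream task's XCom
    encoded = ti.xcom_pull(task_ids="download_zip")
    with zipfile.ZipFile(io.BytesIO(base64.b64decode(encoded))) as zf:
        # Assumed: the archive contains a single CSV to load with pandas
        with zf.open(zf.namelist()[0]) as f:
            df = pd.read_csv(f)
    print(df.head())

with DAG(
    dag_id="data_pipeline",
    start_date=datetime(2024, 1, 1),
    schedule=timedelta(days=1),
    catchup=False,
) as dag:
    download_zip = HttpOperator(
        task_id="download_zip",
        http_conn_id="data_api",   # assumed connection id
        endpoint="export.zip",     # assumed endpoint
        method="GET",
        response_filter=lambda r: base64.b64encode(r.content).decode(),
    )
    process_zip = PythonOperator(task_id="process_zip", python_callable=_process_zip)
    download_zip >> process_zip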
version: '3'
services:
  postgres:
    image: postgres:13
    environment:
      - POSTGRES_USER=airflow
      - POSTGRES_PASSWORD=airflow
      - POSTGRES_DB=airflow
  airflow-init:
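    # The original snippet truncates here; a typical airflow-init service
    # (assumed, following the official docker-compose pattern) initializes
    # the metadata database before the other Airflow services start:
    image: apache/airflow:2.7.3
    depends_on:
      - postgres
    environment:
      - AIRFLOW__DATABASE__SQL_ALCHEMY_CONN=postgresql+psycopg2://airflow:airflow@postgres/airflow
    command: airflow db init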
import requests
from pdfminer.high_level import extract_text
from collections import Counter
import re
# Step 1: Download and save the PDF
pdf_url = "https://arxiv.org/pdf/1509.02971.pdf"
response = requests.get(pdf_url)
response.raise_for_status()
pdf_filename = "nasa_flight_plan.pdf"
with open(pdf_filename, "wb") as f:
    f.write(response.content)
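
# Step 2 is missing from the snippet; a hedged sketch that uses the remaining
# imports (pdfminer's extract_text, Counter, re) to count the most common
# words — the tokenization rule is an assumption:
text = extract_text(pdf_filename)
words = re.findall(r"[a-zA-Z']+", text.lower())
word_counts = Counter(words)
print(word_counts.most_common(10))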
import requests
import pandas as pd
from datetime import datetime, timedelta
import os
def process_earthquake_data():
    today_date = datetime.now()
    one_day = timedelta(days=1)
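    # The function body is truncated in the original; a hedged continuation
    # that pulls the previous day's events from the public USGS FDSN API
    # (the query URL is real; the filename convention is an assumption):
    start = (today_date - one_day).strftime("%Y-%m-%d")
    end = today_date.strftime("%Y-%m-%d")
    url = (
        "https://earthquake.usgs.gov/fdsnws/event/1/query"
        f"?format=csv&starttime={start}&endtime={end}"
    )
    response = requests.get(url)
    response.raise_for_status()
    csv_file_path = os.path.join(os.getcwd(), f"earthquake_{end}.csv")
    with open(csv_file_path, "w") as f:
        f.write(response.text)
    df = pd.read_csv(csv_file_path)  # quick sanity check on the download
    print(f"{len(df)} events for {end}")
    return csv_file_path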
import psycopg2
import os
def delete_old_records(conn, csv_file_path):
    # Delete any rows previously loaded from this file, then commit
    delete_query = "DELETE FROM earthquake WHERE filename=%s"
    cur = conn.cursor()
    cur.execute(delete_query, (csv_file_path,))
    conn.commit()
    cur.close()
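
# The os import suggests the connection is built from environment variables;
# a hedged usage sketch (the variable names are assumptions):
conn = psycopg2.connect(
    host=os.environ.get("DB_HOST", "localhost"),
    dbname=os.environ.get("DB_NAME", "earthquake"),
    user=os.environ.get("DB_USER", "airflow"),
    password=os.environ["DB_PASSWORD"],
)
delete_old_records(conn, "earthquake_2024-01-01.csv")
conn.close()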
import psycopg2
import os
def delete_old_records(conn, start_date, end_date):
    # Delete staged rows in the given date range, then commit
    delete_query = """DELETE FROM raw.public.stage_earthquake
        WHERE dt BETWEEN %s AND %s
    """
    cur = conn.cursor()
    cur.execute(delete_query, (start_date, end_date))
    conn.commit()
    cur.close()
from airflow.decorators import dag, task
from datetime import datetime, timedelta
import requests
import json
import os
from google.cloud import storage
from airflow.operators.bash import BashOperator
# Typical default_args; the keys beyond email_on_failure are assumed
default_args = {
    "owner": "airflow",
    "email_on_failure": False,
    "retries": 1,
    "retry_delay": timedelta(minutes=5),
}
from airflow import DAG
from airflow.operators.python import PythonOperator
from airflow.providers.google.cloud.operators.bigquery import (
BigQueryExecuteQueryOperator,
)
from datetime import datetime
from google.cloud import storage, bigquery
import requests
import zipfile
import os
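
# A hedged sketch of the pipeline these imports suggest: download a zip,
# extract it, upload the contents to GCS, then run a query in BigQuery.
# The URL, bucket, dataset, and table names are all assumptions.
def _download_and_upload():
    url = "https://example.com/export.zip"  # assumed source
    local_zip = "/tmp/export.zip"
    with open(local_zip, "wb") as f:
        f.write(requests.get(url).content)
    with zipfile.ZipFile(local_zip) as zf:
        zf.extractall("/tmp/export")
    client = storage.Client()
    bucket = client.bucket("my-data-lake")  # assumed bucket
    for name in os.listdir("/tmp/export"):
        bucket.blob(f"raw/{name}").upload_from_filename(os.path.join("/tmp/export", name))

with DAG(dag_id="zip_to_bigquery", start_date=datetime(2024, 1, 1), schedule=None) as dag:
    download_and_upload = PythonOperator(
        task_id="download_and_upload", python_callable=_download_and_upload
    )
    transform = BigQueryExecuteQueryOperator(
        task_id="transform",
        sql="SELECT * FROM `my-project.raw.events`",  # assumed query
        destination_dataset_table="my-project.staging.events",
        write_disposition="WRITE_TRUNCATE",
        use_legacy_sql=False,
    )
    download_and_upload >> transform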
from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import (
GCSToBigQueryOperator,
)
from airflow.providers.google.cloud.operators.bigquery import (
BigQueryExecuteQueryOperator,
BigQueryInsertJobOperator,
BigQueryTableCheckOperator,
)
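
# The snippet ends at the imports; a hedged sketch of the wiring they imply —
# load from GCS, check the table, then transform. Bucket, dataset, and table
# names are assumptions.
from datetime import datetime  # needed for start_date; not in the original imports

with DAG(dag_id="gcs_to_bq", start_date=datetime(2024, 1, 1), schedule=None) as dag:
    start = DummyOperator(task_id="start")
    load = GCSToBigQueryOperator(
        task_id="load",
        bucket="my-data-lake",              # assumed bucket
        source_objects=["raw/events.csv"],  # assumed object
        destination_project_dataset_table="my-project.raw.events",
        source_format="CSV",
        skip_leading_rows=1,
        autodetect=True,
        write_disposition="WRITE_TRUNCATE",
    )
    check = BigQueryTableCheckOperator(
        task_id="check",
        table="my-project.raw.events",
        checks={"row_count": {"check_statement": "COUNT(*) > 0"}},
    )
    transform = BigQueryInsertJobOperator(
        task_id="transform",
        configuration={
            "query": {
                "query": "SELECT * FROM `my-project.raw.events`",  # assumed SQL
                "destinationTable": {
                    "projectId": "my-project",
                    "datasetId": "staging",
                    "tableId": "events",
                },
                "writeDisposition": "WRITE_TRUNCATE",
                "useLegacySql": False,
            }
        },
    )
    start >> load >> check >> transform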