
@olivergeorge
Created June 4, 2025 01:13

Gmail Extractor

A Python tool to extract and export Gmail emails to JSONL format using the Gmail API.

Setup

  1. Install dependencies:
    pip install -r requirements.in

Usage

Run the script:

python gmail_extractor.py

The tool will prompt you to enter a search query. You can use Gmail's search syntax to filter emails.
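
The script can also be driven non-interactively by importing its functions from another Python file or a REPL, assuming it is saved as gmail_extractor.py as in the Usage step above. A minimal sketch; the query string and output filename here are just examples:

from gmail_extractor import authenticate_gmail, search_emails, get_email_details, export_to_jsonl

service = authenticate_gmail()
# Example query and limit chosen for illustration
messages = search_emails(service, "from:[email protected] has:attachment", max_results=100)
emails = [get_email_details(service, m["id"]) for m in messages]
# get_email_details returns None on failure, so filter before exporting
export_to_jsonl([e for e in emails if e], "billing_export.jsonl")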

Authentication

On first run, the tool will open a web browser for Gmail authentication. Credentials are saved locally in token.pickle for subsequent runs.
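
The token handling, condensed from the script below into a minimal sketch ('client_secret.json' is a placeholder for the downloaded credentials file):

import os
import pickle
from google.auth.transport.requests import Request
from google_auth_oauthlib.flow import InstalledAppFlow

SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

creds = None
if os.path.exists('token.pickle'):            # reuse the cached token if present
    with open('token.pickle', 'rb') as f:
        creds = pickle.load(f)
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())              # silent refresh, no browser needed
    else:
        # 'client_secret.json' is a placeholder filename for the OAuth client secrets
        flow = InstalledAppFlow.from_client_secrets_file('client_secret.json', SCOPES)
        creds = flow.run_local_server(port=0)  # opens the browser consent screen
    with open('token.pickle', 'wb') as f:      # cache for subsequent runs
        pickle.dump(creds, f)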

Search Examples

Search by Label

label:important

This will find all emails with the "important" label. Note that the count shown is per email, not per thread.
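
Because each exported line is a single message, counting distinct threads means grouping on the thread_id field. A small sketch against a hypothetical export file (the filename is a placeholder):

import json

threads = set()
count = 0
with open("gmail_export_20250604_011300.jsonl", encoding="utf-8") as f:  # placeholder filename
    for line in f:
        record = json.loads(line)
        threads.add(record["thread_id"])
        count += 1
print(f"{count} messages across {len(threads)} threads")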

Other Search Options

  • from:[email protected] - Emails from a specific sender
  • subject:invoice - Emails with "invoice" in subject
  • has:attachment - Emails with attachments
  • after:2024/01/01 before:2024/12/31 - Emails within date range
  • is:unread - Unread emails (the default if no query is entered)
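
Operators can be combined into a single query string. For example, a date-bounded query could be built programmatically like this (the 30-day window is just an illustration):

from datetime import datetime, timedelta

# Emails with attachments from the last 30 days
since = (datetime.now() - timedelta(days=30)).strftime("%Y/%m/%d")
query = f"has:attachment after:{since}"
print(query)  # e.g. has:attachment after:2025/05/05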

Output

The tool exports emails to a JSONL file named gmail_export_YYYYMMDD_HHMMSS.jsonl containing:

  • Email ID and Thread ID
  • Date, From, To, Subject
  • Email snippet and full body text

Each email is saved as a separate JSON object on its own line.
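
Parsed back with json.loads, each line yields a dict of this shape. The values below are made-up placeholders; the fields match those built by get_email_details:

{
    "id": "18c2f0a1b2c3d4e5",          # placeholder message ID
    "thread_id": "18c2f0a1b2c3d4e5",
    "snippet": "Hi, just following up on last week's invoice...",
    "date": "Mon, 2 Jun 2025 09:15:00 +0930",
    "from": "Jane Example <[email protected]>",
    "to": "[email protected]",
    "subject": "Follow up: invoice",
    "body": "Hi, just following up on last week's invoice...\n"
}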

Provisioning services and getting credentials

This has already been done; the credentials file is in this folder.

The process used to generate it was as follows.

Set up Gmail API credentials:

  • Go to Google Cloud Console
  • Create a new project or select an existing one
  • Enable Gmail API
  • Create OAuth 2.0 credentials
  • Download the credentials JSON file and place it in the project directory

# gmail_extractor.py
import os
import pickle
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
import base64
import email
import json
from datetime import datetime

SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
CREDENTIALS_FILE = 'client_secret_74136561182-bhro4ocvfdqmhdvqgktn6nur5bldd0d8.apps.googleusercontent.com.json'


def authenticate_gmail():
    """Authenticate and return Gmail service object"""
    creds = None
    # Token file stores user's access and refresh tokens
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no valid credentials, request authorization
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                CREDENTIALS_FILE, SCOPES)
            creds = flow.run_local_server(port=0)
        # Save credentials for next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)
    return build('gmail', 'v1', credentials=creds)


def search_emails(service, query, max_results=1000):
    """Search for emails matching the query"""
    try:
        print(f"DEBUG: Executing query: '{query}'")
        all_messages = []
        page_token = None
        while len(all_messages) < max_results:
            results = service.users().messages().list(
                userId='me',
                q=query,
                maxResults=min(500, max_results - len(all_messages)),
                pageToken=page_token
            ).execute()
            messages = results.get('messages', [])
            all_messages.extend(messages)
            print(f"DEBUG: Got {len(messages)} messages in this page, total so far: {len(all_messages)}")
            page_token = results.get('nextPageToken')
            if not page_token:
                break
        print(f"DEBUG: Final count: {len(all_messages)} messages")
        return all_messages[:max_results]
    except Exception as error:
        print(f'An error occurred: {error}')
        return []


def get_email_details(service, message_id):
    """Get detailed information about a specific email"""
    try:
        message = service.users().messages().get(
            userId='me',
            id=message_id,
            format='full'
        ).execute()
        payload = message['payload']
        headers = payload.get('headers', [])
        # Extract common header fields
        email_data = {
            'id': message_id,
            'thread_id': message.get('threadId'),
            'snippet': message.get('snippet'),
            'date': '',
            'from': '',
            'to': '',
            'subject': '',
            'body': ''
        }
        # Parse headers
        for header in headers:
            name = header['name'].lower()
            if name == 'date':
                email_data['date'] = header['value']
            elif name == 'from':
                email_data['from'] = header['value']
            elif name == 'to':
                email_data['to'] = header['value']
            elif name == 'subject':
                email_data['subject'] = header['value']
        # Extract body
        email_data['body'] = extract_body(payload)
        return email_data
    except Exception as error:
        print(f'An error occurred getting email details: {error}')
        return None


def extract_body(payload):
    """Extract email body from payload (top-level text/plain part only)"""
    body = ""
    if 'parts' in payload:
        for part in payload['parts']:
            if part['mimeType'] == 'text/plain':
                data = part['body'].get('data')
                if data:
                    body = base64.urlsafe_b64decode(data).decode('utf-8', errors='replace')
                break
    else:
        if payload['mimeType'] == 'text/plain':
            data = payload['body'].get('data')
            if data:
                body = base64.urlsafe_b64decode(data).decode('utf-8', errors='replace')
    return body


def export_to_jsonl(emails, filename='extracted_emails.jsonl'):
    """Export emails to JSONL file"""
    with open(filename, 'w', encoding='utf-8') as jsonlfile:
        for email_data in emails:
            jsonlfile.write(json.dumps(email_data) + '\n')
    print(f"Exported {len(emails)} emails to {filename}")


def main():
    # Authenticate
    print("Authenticating with Gmail API...")
    service = authenticate_gmail()
    # Example search queries (for reference only; the query actually used comes from the prompt below)
    search_queries = [
        # "from:[email protected]",
        # "subject:invoice",
        # "has:attachment",
        # "after:2024/01/01 before:2024/12/31",
        "is:unread"  # Default: get unread emails
    ]
    print("Enter your search query (or press Enter for unread emails):")
    user_query = input().strip()
    if user_query:
        query = user_query
    else:
        query = "is:unread"
    print(f"Searching for emails with query: {query}")
    # Search for emails
    messages = search_emails(service, query, max_results=1000)
    if not messages:
        print("No emails found.")
        return
    print(f"Found {len(messages)} emails. Extracting details...")
    # Get detailed information for each email
    emails = []
    for i, message in enumerate(messages):
        print(f"Processing email {i+1}/{len(messages)}")
        email_data = get_email_details(service, message['id'])
        if email_data:
            emails.append(email_data)
    # Export to JSONL
    if emails:
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"gmail_export_{timestamp}.jsonl"
        export_to_jsonl(emails, filename)
    else:
        print("No email data extracted.")


if __name__ == '__main__':
    main()

# requirements.in
google-auth
google-auth-oauthlib
google-auth-httplib2
google-api-python-client