tmonjalo/list-fftabs.py

kth8 · 2024-10-02T21:06:16Z

I was looking for a way to list my open Firefox tabs and found this after trying other solutions which were outdated. I use the Firefox flatpak with single window so I asked AI to refactor @RanTalbott's script and this seems to do what I want:

#! /usr/bin/env python3
import argparse
import pathlib
import lz4.block
import json
from urllib.parse import urlparse

def parse_arguments():
    """Parse the command-line options of the tab lister.

    Returns:
        argparse.Namespace with ``profile`` (substring of the profile
        directory name) and ``mozilla_path`` (directory that holds the
        Firefox profiles; defaults to the Flatpak location).
    """
    default_root = '~/.var/app/org.mozilla.firefox/.mozilla/firefox'

    cli = argparse.ArgumentParser(
        description="Extract open tab titles and URLs from Firefox sessionstore backups.",
    )
    cli.add_argument(
        '--profile',
        type=str,
        default='default',
        help="The Firefox profile name (default: 'default').",
    )
    cli.add_argument(
        '--mozilla-path',
        type=str,
        default=default_root,
        help="The base path to the Firefox profiles directory (default: '~/.var/app/org.mozilla.firefox/.mozilla/firefox').",
    )
    return cli.parse_args()

def get_session_files(mozilla_path, profile):
    """Lazily yield the session recovery files of every matching profile.

    Args:
        mozilla_path: Directory containing the Firefox profile folders;
            a leading ``~`` is expanded to the user's home directory.
        profile: Substring that a profile folder name must contain.

    Returns:
        A generator of ``pathlib.Path`` objects, one per ``recovery.*``
        file found in a matching profile's ``sessionstore-backups`` folder.
    """
    profiles_root = pathlib.Path(mozilla_path).expanduser()
    pattern = f'*{profile}*/sessionstore-backups/recovery.*'
    return profiles_root.glob(pattern)

def read_and_decompress_file(file_path):
    """Read a Firefox ``mozLz40`` file and return its decompressed payload.

    Args:
        file_path: ``pathlib.Path`` of the session file to read.

    Returns:
        The decompressed bytes, or ``None`` when the file cannot be read,
        does not start with the ``mozLz40\\0`` magic header, or fails to
        decompress. A message is printed in each of those cases.
    """
    try:
        raw = file_path.read_bytes()
    except OSError as e:
        # OSError covers FileNotFoundError as well as PermissionError,
        # IsADirectoryError, etc., so one unreadable file cannot abort the run.
        print(f"Error reading or decompressing file {file_path}: {e}")
        return None

    if not raw.startswith(b'mozLz40\0'):
        print(f"Skipping non-LZ4 file: {file_path}")
        return None

    try:
        # The LZ4 block starts right after the 8-byte magic header.
        return lz4.block.decompress(raw[8:])
    except lz4.block.LZ4BlockError as e:
        print(f"Error reading or decompressing file {file_path}: {e}")
        return None

def parse_json_data(data, file_path):
    """Decode a JSON document, reporting failures instead of raising.

    Args:
        data: JSON text as ``str`` or ``bytes``.
        file_path: Origin of the data; only used in the error message.

    Returns:
        The decoded Python object, or ``None`` when the data is not valid
        JSON (a message is printed in that case).
    """
    try:
        return json.loads(data)
    except ValueError as e:
        # json.JSONDecodeError derives from ValueError, and so does the
        # UnicodeDecodeError raised when the bytes are not valid text;
        # catching ValueError handles both kinds of corrupt input.
        print(f"Error parsing JSON data from file {file_path}: {e}")
        return None

def extract_titles_from_session(session_data, unique_titles, file_path):
    """Collect "<site> - <title>" labels for the page shown in each tab.

    Args:
        session_data: Decoded session dictionary; its ``windows`` list is
            walked, and each window's ``tabs`` list in turn.
        unique_titles: Set that receives the labels (mutated in place).
        file_path: Origin of the session; only used in diagnostics.

    Malformed windows, tabs, entry lists and entries are reported and
    skipped. Tabs whose current entry has no URL are ignored.
    """
    for win in session_data.get('windows', []):
        if not isinstance(win, dict):
            print(f"Invalid window structure in file {file_path}")
            continue

        for tab_record in win.get('tabs', []):
            if not isinstance(tab_record, dict):
                print(f"Invalid tab structure in file {file_path}")
                continue

            # 'index' is 1-based in the session data; convert to a list offset.
            position = tab_record.get('index', 0) - 1
            history = tab_record.get('entries', [])
            if not isinstance(history, list):
                print(f"Invalid entries structure in file {file_path}")
                continue
            if position < 0 or position >= len(history):
                continue

            current = history[position]
            if not isinstance(current, dict):
                print(f"Invalid entry structure in file {file_path}")
                continue

            page_url = current.get('url', '')
            if not page_url:
                continue
            page_title = current.get('title', 'Untitled')

            try:
                host = urlparse(page_url).netloc
            except ValueError:
                print(f"Invalid URL format in file {file_path}: {page_url}")
                continue

            if host.startswith("www."):
                host = host[4:]
            unique_titles.add(f"{host} - {page_title}")

def main():
    """Print one "<site> - <title>" line per distinct open tab.

    Every recovery file of the selected profile(s) is read, decompressed
    and parsed; files that fail any step are skipped. Labels are gathered
    in a set, so a tab present in several recovery files is printed once.
    """
    options = parse_arguments()
    seen_labels = set()

    for candidate in get_session_files(options.mozilla_path, options.profile):
        if not (candidate.exists() and candidate.is_file()):
            print(f"Skipping invalid or inaccessible file: {candidate}")
            continue

        payload = read_and_decompress_file(candidate)
        if payload is None:
            continue

        session = parse_json_data(payload, candidate)
        if session is None:
            continue

        extract_titles_from_session(session, seen_labels, candidate)

    for label in seen_labels:
        print(label)

if __name__ == "__main__":
    main()

mabra · 2024-12-09T02:05:40Z

Thanks for this script.
Does anyone know if and how it is possible under Linux to get the workspace (number or name)
from a tab('s title)? Due to the nature of the window hierarchy (as far as I understand it), tools like
'xdotool' are unable to map a title to a workspace (it only partially works) - can this script be of help?

Joshfindit · 2025-05-17T22:51:40Z

Thanks to @tmonjalo and @kth8

Here's a version that builds on it to add JSON export as well as the history for each tab:

#! /usr/bin/env python3
import argparse
import pathlib
import lz4.block
import json
from urllib.parse import urlparse
import datetime
import sys

def parse_arguments():
    """Parse the command-line options of the JSON tab exporter.

    Returns:
        argparse.Namespace with ``profile``, ``mozilla_path``, ``output``
        (destination JSON file) and ``show_schema`` (boolean flag).
    """
    default_root = '~/.var/app/org.mozilla.firefox/.mozilla/firefox'
    default_output = 'tabs_export.json'

    cli = argparse.ArgumentParser(
        description="Extract open tab titles and URLs from Firefox sessionstore backups.",
    )
    cli.add_argument(
        '--profile',
        type=str,
        default='default',
        help="The Firefox profile name (default: 'default').",
    )
    cli.add_argument(
        '--mozilla-path',
        type=str,
        default=default_root,
        help="The base path to the Firefox profiles directory (default: '~/.var/app/org.mozilla.firefox/.mozilla/firefox').",
    )
    cli.add_argument(
        '--output',
        type=str,
        default=default_output,
        help="Output JSON file path (default: 'tabs_export.json').",
    )
    cli.add_argument(
        '--show-schema',
        action='store_true',
        help="Display the output JSON schema and exit.",
    )
    return cli.parse_args()

def get_session_files(mozilla_path, profile):
    """Lazily yield the session recovery files of every matching profile.

    Args:
        mozilla_path: Directory containing the Firefox profile folders;
            a leading ``~`` is expanded to the user's home directory.
        profile: Substring that a profile folder name must contain.

    Returns:
        A generator of ``pathlib.Path`` objects, one per ``recovery.*``
        file found in a matching profile's ``sessionstore-backups`` folder.
    """
    profiles_root = pathlib.Path(mozilla_path).expanduser()
    pattern = f'*{profile}*/sessionstore-backups/recovery.*'
    return profiles_root.glob(pattern)

def read_and_decompress_file(file_path):
    """Read a Firefox ``mozLz40`` file and return its decompressed payload.

    Args:
        file_path: ``pathlib.Path`` of the session file to read.

    Returns:
        The decompressed bytes, or ``None`` when the file cannot be read,
        does not start with the ``mozLz40\\0`` magic header, or fails to
        decompress. A message is printed in each of those cases.
    """
    try:
        raw = file_path.read_bytes()
    except OSError as e:
        # OSError covers FileNotFoundError as well as PermissionError,
        # IsADirectoryError, etc., so one unreadable file cannot abort the run.
        print(f"Error reading or decompressing file {file_path}: {e}")
        return None

    if not raw.startswith(b'mozLz40\0'):
        print(f"Skipping non-LZ4 file: {file_path}")
        return None

    try:
        # The LZ4 block starts right after the 8-byte magic header.
        return lz4.block.decompress(raw[8:])
    except lz4.block.LZ4BlockError as e:
        print(f"Error reading or decompressing file {file_path}: {e}")
        return None

def parse_json_data(data, file_path):
    """Decode a JSON document, reporting failures instead of raising.

    Args:
        data: JSON text as ``str`` or ``bytes``.
        file_path: Origin of the data; only used in the error message.

    Returns:
        The decoded Python object, or ``None`` when the data is not valid
        JSON (a message is printed in that case).
    """
    try:
        return json.loads(data)
    except ValueError as e:
        # json.JSONDecodeError derives from ValueError, and so does the
        # UnicodeDecodeError raised when the bytes are not valid text;
        # catching ValueError handles both kinds of corrupt input.
        print(f"Error parsing JSON data from file {file_path}: {e}")
        return None

def format_timestamp(timestamp):
    """Convert an epoch timestamp to an ISO 8601 string in local time.

    The unit is chosen by magnitude: values of at least 10**14 are read as
    microseconds, smaller ones as milliseconds (the unit this exporter's
    schema documents for ``last_accessed``). Present-day millisecond values
    are around 10**12 and microsecond values around 10**15, so the two
    ranges do not overlap for realistic dates.

    Args:
        timestamp: Numeric epoch timestamp; falsy values mean "missing".

    Returns:
        ``None`` for a missing timestamp, the ISO 8601 string on success,
        or ``str(timestamp)`` when the value cannot be converted.
    """
    if not timestamp:
        return None

    microsecond_threshold = 10 ** 14
    try:
        divisor = 1000000 if abs(timestamp) >= microsecond_threshold else 1000
        return datetime.datetime.fromtimestamp(timestamp / divisor).isoformat()
    except (ValueError, TypeError, OverflowError, OSError):
        # fromtimestamp() may also raise OSError for values the platform's
        # localtime() rejects; fall back to the raw value in every case.
        return str(timestamp)  # Return original if conversion fails

def extract_tabs_from_session(session_data, file_path):
    """Build one export record per tab found in a decoded session.

    Args:
        session_data: Decoded session dictionary; its ``windows`` list is
            walked, and each window's ``tabs`` list in turn.
        file_path: Origin of the session; only used in diagnostics.

    Returns:
        A list of dictionaries. Each holds ``window_index``, ``tab_index``,
        ``current_index`` and ``history`` (one item per valid entry), plus
        ``last_accessed`` when the tab carries it, and ``current_title``,
        ``current_url`` and ``domain`` when the current entry is usable.
        Malformed windows, tabs and entry lists are reported and skipped;
        malformed individual entries are reported and left out of
        ``history``.
    """
    tabs_data = []

    for window_idx, window in enumerate(session_data.get('windows', [])):
        if not isinstance(window, dict):
            print(f"Invalid window structure in file {file_path}")
            continue

        for tab_idx, tab in enumerate(window.get('tabs', [])):
            if not isinstance(tab, dict):
                print(f"Invalid tab structure in file {file_path}")
                continue

            tab_data = {
                "window_index": window_idx,
                "tab_index": tab_idx,
                "history": []
            }

            # Extract last access time if available (kept as the raw value)
            if "lastAccessed" in tab:
                tab_data["last_accessed"] = tab.get("lastAccessed")

            # 'index' is 1-based in the session data; convert to a list offset
            current_index = tab.get('index', 0) - 1
            tab_data["current_index"] = current_index

            # Extract history entries
            entries = tab.get('entries', [])
            if not isinstance(entries, list):
                print(f"Invalid entries structure in file {file_path}")
                continue

            for entry_idx, entry in enumerate(entries):
                if not isinstance(entry, dict):
                    print(f"Invalid entry structure in file {file_path}")
                    continue

                history_entry = {
                    "entry_index": entry_idx,
                    "title": entry.get('title', 'Untitled'),
                    "url": entry.get('url', ''),
                    "is_current": entry_idx == current_index
                }

                # Extract timestamp if available
                if "lastAccessed" in entry:
                    history_entry["accessed_at"] = format_timestamp(entry.get("lastAccessed"))

                # Extract scroll position; coordinates default to 0 when the
                # nested "scroll" value is not a dictionary
                scroll_data = entry.get("scroll")
                if isinstance(scroll_data, dict):
                    offsets = scroll_data.get("scroll")
                    if not isinstance(offsets, dict):
                        offsets = {}
                    history_entry["scroll_position"] = {
                        "x": offsets.get("x", 0),
                        "y": offsets.get("y", 0)
                    }

                tab_data["history"].append(history_entry)

            # Set current page info. The current entry needs the same dict
            # check as the history loop (which already reported it if it is
            # malformed); otherwise `.get` would raise AttributeError.
            if 0 <= current_index < len(entries):
                current_entry = entries[current_index]
                if isinstance(current_entry, dict):
                    current_url = current_entry.get('url', '')
                    tab_data["current_title"] = current_entry.get('title', 'Untitled')
                    tab_data["current_url"] = current_url

                    # Domain for convenience
                    try:
                        domain = urlparse(current_url).netloc
                        if domain.startswith("www."):
                            domain = domain[4:]
                        tab_data["domain"] = domain
                    except ValueError:
                        tab_data["domain"] = ""

            tabs_data.append(tab_data)

    return tabs_data

def display_schema():
    """Print the JSON Schema of the export file, followed by a sample export.

    Both documents are hard-coded literals written to standard output with
    two-space indentation; nothing is read from disk and nothing is returned.
    """
    # JSON Schema (draft-07) describing the top-level export object and the
    # per-tab / per-history-entry records nested inside it.
    # NOTE(review): "milliseconds" below is a custom annotation, presumably not
    # one of the standard JSON Schema "format" values — confirm if validators
    # are expected to consume this schema.
    schema = {
        "$schema": "http://json-schema.org/draft-07/schema#",
        "title": "Firefox Tab Export",
        "type": "object",
        "properties": {
            "export_date": {
                "type": "string",
                "format": "date-time",
                "description": "ISO8601 timestamp of when the export was generated"
            },
            "total_tabs": {
                "type": "integer",
                "description": "Total number of tabs exported"
            },
            "tabs": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "window_index": {
                            "type": "integer",
                            "description": "Index of the window containing this tab"
                        },
                        "tab_index": {
                            "type": "integer",
                            "description": "Index of this tab within its window"
                        },
                        "last_accessed": {
                            "type": ["integer", "null"],
                            "format": "milliseconds",
                            "description": "Timestamp of when this tab was last accessed"
                        },
                        "current_index": {
                            "type": "integer",
                            "description": "Current position in the tab's history"
                        },
                        "current_title": {
                            "type": "string",
                            "description": "Title of the current page"
                        },
                        "current_url": {
                            "type": "string",
                            "format": "uri",
                            "description": "Full URL of the current page"
                        },
                        "domain": {
                            "type": "string",
                            "description": "Domain name of the current page (without www prefix)"
                        },
                        "history": {
                            "type": "array",
                            "description": "Navigation history of this tab",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "entry_index": {
                                        "type": "integer",
                                        "description": "Position of this entry in the history"
                                    },
                                    "title": {
                                        "type": "string",
                                        "description": "Page title"
                                    },
                                    "url": {
                                        "type": "string",
                                        "format": "uri",
                                        "description": "Full page URL"
                                    },
                                    "is_current": {
                                        "type": "boolean",
                                        "description": "Whether this is the currently visible page in the tab"
                                    },
                                    "accessed_at": {
                                        "type": ["string", "null"],
                                        "format": "date-time",
                                        "description": "ISO8601 timestamp of when this history entry was accessed"
                                    },
                                    "scroll_position": {
                                        "type": "object",
                                        "description": "Scroll position in pixels",
                                        "properties": {
                                            "x": {
                                                "type": "integer",
                                                "description": "Horizontal scroll position in pixels"
                                            },
                                            "y": {
                                                "type": "integer",
                                                "description": "Vertical scroll position in pixels"
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    print(json.dumps(schema, indent=2))
    print("\nExample output structure:")

    # Illustrative export: one window holding two tabs, the first with a
    # two-entry history and the second with a single entry.
    example = {
        "export_date": "2025-05-17T10:30:45.123456",
        "total_tabs": 2,
        "tabs": [
            {
                "window_index": 0,
                "tab_index": 0,
                "last_accessed": 1681943790848,
                "current_index": 1,
                "current_title": "Example Page",
                "current_url": "https://example.com/page",
                "domain": "example.com",
                "history": [
                    {
                        "entry_index": 0,
                        "title": "Example Home",
                        "url": "https://example.com/",
                        "is_current": False,
                        "accessed_at": "2025-05-17T10:10:25.123456",
                        "scroll_position": {
                            "x": 0,
                            "y": 0
                        }
                    },
                    {
                        "entry_index": 1,
                        "title": "Example Page",
                        "url": "https://example.com/page",
                        "is_current": True,
                        "accessed_at": "2025-05-17T10:15:30.123456",
                        "scroll_position": {
                            "x": 0,
                            "y": 1250
                        }
                    }
                ]
            },
            {
                "window_index": 0,
                "tab_index": 1,
                "last_accessed": 1681943790848,
                "current_index": 0,
                "current_title": "Another Page",
                "current_url": "https://another-example.org/",
                "domain": "another-example.org",
                "history": [
                    {
                        "entry_index": 0,
                        "title": "Another Page",
                        "url": "https://another-example.org/",
                        "is_current": True,
                        "accessed_at": "2025-05-17T10:20:15.123456",
                        "scroll_position": {
                            "x": 0,
                            "y": 500
                        }
                    }
                ]
            }
        ]
    }

    print(json.dumps(example, indent=2))

def main():
    """Export the open tabs of the selected profile(s) to a JSON file.

    With ``--show-schema`` the schema is printed and the program exits.
    Otherwise one session snapshot per profile directory is exported: the
    ``recovery.*`` glob normally matches both ``recovery.jsonlz4`` and its
    backup ``recovery.baklz4``, and exporting both would list every tab
    twice and inflate ``total_tabs``. The backup is only used when the
    primary snapshot cannot be read or parsed.
    """
    args = parse_arguments()

    # If show-schema flag is set, display schema and exit
    if args.show_schema:
        display_schema()
        sys.exit(0)

    # Order candidates so that, within each profile directory,
    # recovery.jsonlz4 is tried before any other recovery file.
    candidates = sorted(
        get_session_files(args.mozilla_path, args.profile),
        key=lambda p: (str(p.parent), p.name != 'recovery.jsonlz4', p.name),
    )
    all_tabs = []
    exported_dirs = set()

    for f in candidates:
        if f.parent in exported_dirs:
            # A snapshot of this profile was already exported.
            continue

        if not f.exists() or not f.is_file():
            print(f"Skipping invalid or inaccessible file: {f}")
            continue

        data = read_and_decompress_file(f)
        if data is None:
            continue

        session_data = parse_json_data(data, f)
        if session_data is None:
            continue

        all_tabs.extend(extract_tabs_from_session(session_data, f))
        exported_dirs.add(f.parent)

    # Create final output with metadata
    output_data = {
        "export_date": datetime.datetime.now().isoformat(),
        "total_tabs": len(all_tabs),
        "tabs": all_tabs
    }

    # Write to JSON file
    with open(args.output, 'w', encoding='utf-8') as f:
        json.dump(output_data, f, indent=2, ensure_ascii=False)

    print(f"Exported {len(all_tabs)} tabs to {args.output}")

if __name__ == "__main__":
    main()

	#! /usr/bin/env python3

	"""
	List all Firefox tabs with title and URL

	Supported input: json or jsonlz4 recovery files
	Default output: title (URL)
	Output format can be specified as argument
	"""

	import os
	import platform
	import sys
	import pathlib
	import lz4.block
	import json

	# Locate the directory that holds the Firefox profiles on this platform.
	if platform.system() == 'Windows':
	    # os must be imported for this branch; it was missing originally.
	    path = pathlib.Path(os.environ['APPDATA']).joinpath('Mozilla\\Firefox\\Profiles')
	elif platform.system() == 'Darwin':
	    path = pathlib.Path.home().joinpath('Library/Application Support/Firefox/Profiles')
	else:
	    path = pathlib.Path.home().joinpath('.mozilla/firefox')
	# Profile directories carry a prefix/suffix around "default"
	# (e.g. "xxxxxxxx.default-release"), hence the wildcards on both sides.
	files = path.glob('*default*/sessionstore-backups/recovery.js*')

	# Optional first argument: a %-style template taking (title, url).
	try:
	    template = sys.argv[1]
	except IndexError:
	    template = '%s (%s)'

	for f in files:
	    b = f.read_bytes()
	    # jsonlz4 files start with an 8-byte magic header before the LZ4 block;
	    # plain json files are parsed as they are.
	    if b[:8] == b'mozLz40\0':
	        b = lz4.block.decompress(b[8:])
	    j = json.loads(b)
	    for w in j['windows']:
	        for t in w['tabs']:
	            # 'index' is 1-based: it points at the entry currently shown.
	            i = t['index'] - 1
	            if len(t['entries']) > i:
	                print(template % (
	                    t['entries'][i]['title'],
	                    t['entries'][i]['url']
	                ))

tmonjalo/list-fftabs.py

kth8 commented Oct 2, 2024

Uh oh!

mabra commented Dec 9, 2024

Uh oh!

Joshfindit commented May 17, 2025

Uh oh!