kohlerdominik · January 31, 2026 15:55
diff --git a/gemweb_directory_scraper.ps1 b/gemweb_directory_scraper.ps1
 <#
 .SYNOPSIS
    Parse Gemweb/CityWeb Directory Pages - Extract company data from Swiss municipality directories

 .DESCRIPTION
    This script extracts company listings from Gemweb/CityWeb directory websites (KantoneWeb platform by i-web.ch).
    These are commonly used by Swiss municipalities for their business directories.

    The script reads the data-entities JSON attribute from the directory page, parses it, and fetches detailed
    address information from each company's detail page. Output is a tab-separated TSV file ready for analysis.

    Detail pages are requested from the same scheme and host as PageUrl
    (<scheme>://<host>/_rte/unternehmen/<id>).

 .PARAMETER PageUrl
    The full URL to the Gemweb/CityWeb directory page (e.g., https://www.innertkirchen.ch/firmenverzeichnis)
    The page must contain the data-entities attribute with company data.

 .PARAMETER OutputFile
    The output file path for the TSV results (e.g., c:\data\companies.txt)

 .EXAMPLE
    PowerShell -ExecutionPolicy Bypass -File gemweb_directory_scraper.ps1 `
        -PageUrl "https://www.innertkirchen.ch/firmenverzeichnis" `
        -OutputFile "c:\Users\Dominik\meiringen\innertkirchen_kontaktliste.txt"

 .EXAMPLE
    PowerShell -ExecutionPolicy Bypass -File gemweb_directory_scraper.ps1 `
        -PageUrl "https://example.ch/firmenverzeichnis" `
        -OutputFile "c:\data\companies_list.txt"

 .OUTPUTS
    Tab-separated values file with columns:
    - id (sequential number)
    - name (company name)
    - ort (municipality/city)
    - telfon (phone number; the column header is written with this spelling)
    - email (email address)
    - branche (business category, taken from the brancheId field)
    - address (up to three lines of the postal address, comma-separated)

 .NOTES
    Platform: Gemweb/CityWeb (KantoneWeb) by i-web.ch
    - The script includes 150ms delay between requests to be respectful to servers
    - Progress is shown as [current/total] during address fetching
    - Only works with Gemweb/CityWeb directory pages that expose data-entities attribute
    - Common Swiss municipalities: Innertkirchen, Brienz, Hasliberg, Meiringen, etc.

 .LINK
    https://www.i-web.ch - i-web.ch platform provider
 #>

 param(
    [Parameter(Mandatory=$true)]
    [string]$PageUrl,

    [Parameter(Mandatory=$true)]
    [string]$OutputFile
 )

 # Returns the decoded, trimmed text between the first '>' and the next '<' of
 # an HTML fragment (e.g. the label of an <a> element). Falls back to $Default
 # when the fragment contains no such text.
 function Get-LinkText {
    param(
        [string]$Html,
        [string]$Default = ""
    )

    $linkMatch = [regex]::Match($Html, '>([^<]+)<')
    $text = if ($linkMatch.Success) { $linkMatch.Groups[1].Value } else { $Default }
    # The fragment is HTML, so its text may still carry entities such as &amp;.
    return [System.Net.WebUtility]::HtmlDecode($text).Trim()
 }

 # Converts a value to a single-line string that is safe inside one TSV cell:
 # tabs and line breaks would otherwise shift columns or split the row.
 function ConvertTo-TsvField {
    param($Value)

    return (([string]$Value) -replace '[\t\r\n]+', ' ')
 }

 # Extracts a comma-separated postal address from the
 # <address class="icms-contact-container"> element of a detail page.
 # Returns an empty string when the element is not present.
 function Get-AddressFromHtml {
    param([string]$Html)

    $addressPattern = '<address[^>]*class="icms-contact-container"[^>]*>(.*?)</address>'
    $addressMatch = [regex]::Match($Html, $addressPattern, [System.Text.RegularExpressions.RegexOptions]::Singleline)
    if (-not $addressMatch.Success) {
        return ""
    }

    # Every tag becomes a '|' separator so <br>, </p> etc. delimit address lines.
    $text = [regex]::Replace($addressMatch.Groups[1].Value, '<[^>]+>', '|')
    $text = [System.Net.WebUtility]::HtmlDecode($text)
    # Collapse runs of whitespace (including newlines) into single spaces.
    $text = [regex]::Replace($text, '\s+', ' ')

    $lines = $text -split '\|' | Where-Object { $_ -match '\S' } | ForEach-Object { $_.Trim() }
    # Skip the first line (usually the company name) and keep the next three.
    $addressLines = $lines | Select-Object -Skip 1 | Select-Object -First 3
    $address = ($addressLines -join ', ')

    # Strip "Tel."/"Fax" labels and phone numbers that slipped into those lines.
    # \b keeps words that merely start with Tel/Fax (e.g. "Telefon") intact.
    $address = [regex]::Replace($address, '\s*\bTel\b\.?\s*', '')
    $address = [regex]::Replace($address, '\s*\bFax\b\.?\s*', '')
    $address = [regex]::Replace($address, '\s*0\d{1,3}\s+\d{3}\s+\d{2}\s+\d{2}', '')
    # Remove trailing commas and spaces
    $address = $address -replace ',\s*$', ''
    return $address.Trim()
 }

 # Detail pages live on the same site as the directory page, so derive
 # scheme + host from PageUrl instead of hard-coding a single municipality.
 $pageUri = $null
 if (-not [System.Uri]::TryCreate($PageUrl, [System.UriKind]::Absolute, [ref]$pageUri)) {
    Write-Error "PageUrl is not a valid absolute URL: $PageUrl"
    exit 1
 }
 $baseUrl = $pageUri.GetLeftPart([System.UriPartial]::Authority)

 # Fetch the page
 Write-Host "Fetching $PageUrl..."
 try {
    $response = Invoke-WebRequest -Uri $PageUrl -UseBasicParsing -ErrorAction Stop
 }
 catch {
    Write-Error "Could not fetch $PageUrl ($($_.Exception.Message))"
    exit 1
 }

 # Extract data-entities attribute
 $pattern = 'data-entities="([^"]*)"'
 $match = [regex]::Match($response.Content, $pattern)

 if (-not $match.Success) {
    Write-Error "Could not find data-entities attribute in page. Please verify this is a Gemweb/CityWeb directory page."
    exit 1
 }

 # Decode HTML entities from the attribute value
 $jsonString = [System.Net.WebUtility]::HtmlDecode($match.Groups[1].Value)

 Write-Host "Parsing JSON data..."
 try {
    $data = $jsonString | ConvertFrom-Json -ErrorAction Stop
 }
 catch {
    Write-Error "Could not parse data-entities JSON ($($_.Exception.Message))"
    exit 1
 }

 # Process the entries. A generic list avoids re-allocating an array on each append.
 $results = New-Object 'System.Collections.Generic.List[object]'
 $id = 1
 $totalCount = $data.data.Count

 foreach ($item in $data.data) {
    # The name, telefon and email fields are HTML links; keep only the link text.
    $name = Get-LinkText -Html $item.name -Default $item.name
    $phone = Get-LinkText -Html $item.telefon
    $email = Get-LinkText -Html $item.email

    # Extract company ID from link
    $idMatch = [regex]::Match($item.name, '/unternehmen/(\d+)')
    $companyId = if ($idMatch.Success) { $idMatch.Groups[1].Value } else { "" }

    # Fetch address from detail page
    $address = ""
    if ($companyId) {
        Write-Host "[$id/$totalCount] Fetching address for $name (ID: $companyId)..."
        try {
            $detailUrl = "$baseUrl/_rte/unternehmen/$companyId"
            $detailResponse = Invoke-WebRequest -Uri $detailUrl -UseBasicParsing -ErrorAction Stop
            $address = Get-AddressFromHtml -Html $detailResponse.Content
        }
        catch {
            # Best effort: a missing address must not abort the whole export.
            Write-Warning "Could not fetch address for $name ($($_.Exception.Message))"
        }
        finally {
            # Rate limiting applies after failed requests too.
            Start-Sleep -Milliseconds 150
        }
    }

    $results.Add([pscustomobject]@{
        id      = $id
        name    = (ConvertTo-TsvField $name)
        ort     = (ConvertTo-TsvField $item.ort)
        telefon = (ConvertTo-TsvField $phone)
        email   = (ConvertTo-TsvField $email)
        branche = (ConvertTo-TsvField $item.brancheId)
        address = (ConvertTo-TsvField $address)
    })

    $id++
 }

 # Output as tab-separated values
 Write-Host "Writing to $OutputFile..."
 $output = New-Object 'System.Collections.Generic.List[string]'
 # NOTE(review): the header spells the phone column "telfon"; kept unchanged so
 # existing consumers of the file keep working.
 $output.Add("id`tname`tort`ttelfon`temail`tbranche`taddress")

 foreach ($row in $results) {
    $output.Add("$($row.id)`t$($row.name)`t$($row.ort)`t$($row.telefon)`t$($row.email)`t$($row.branche)`t$($row.address)")
 }

 $output | Out-File -FilePath $OutputFile -Encoding UTF8

 Write-Host "Done! Parsed $($results.Count) companies to $OutputFile"
 Write-Host "Addresses have been extracted from detail pages"
	<#
	.SYNOPSIS
	    Parse Gemweb/CityWeb Directory Pages - Extract company data from Swiss municipality directories

	.DESCRIPTION
	    This script extracts company listings from Gemweb/CityWeb directory websites (KantoneWeb platform by i-web.ch).
	    These are commonly used by Swiss municipalities for their business directories.

	    The script reads the data-entities JSON attribute from the directory page, parses it, and fetches detailed
	    address information from each company's detail page. Output is a tab-separated TSV file ready for analysis.

	    Detail pages are requested from the same scheme and host as PageUrl
	    (<scheme>://<host>/_rte/unternehmen/<id>).

	.PARAMETER PageUrl
	    The full URL to the Gemweb/CityWeb directory page (e.g., https://www.innertkirchen.ch/firmenverzeichnis)
	    The page must contain the data-entities attribute with company data.

	.PARAMETER OutputFile
	    The output file path for the TSV results (e.g., c:\data\companies.txt)

	.EXAMPLE
	    PowerShell -ExecutionPolicy Bypass -File gemweb_directory_scraper.ps1 `
	        -PageUrl "https://www.innertkirchen.ch/firmenverzeichnis" `
	        -OutputFile "c:\Users\Dominik\meiringen\innertkirchen_kontaktliste.txt"

	.EXAMPLE
	    PowerShell -ExecutionPolicy Bypass -File gemweb_directory_scraper.ps1 `
	        -PageUrl "https://example.ch/firmenverzeichnis" `
	        -OutputFile "c:\data\companies_list.txt"

	.OUTPUTS
	    Tab-separated values file with columns:
	    - id (sequential number)
	    - name (company name)
	    - ort (municipality/city)
	    - telfon (phone number; the column header is written with this spelling)
	    - email (email address)
	    - branche (business category, taken from the brancheId field)
	    - address (up to three lines of the postal address, comma-separated)

	.NOTES
	    Platform: Gemweb/CityWeb (KantoneWeb) by i-web.ch
	    - The script includes 150ms delay between requests to be respectful to servers
	    - Progress is shown as [current/total] during address fetching
	    - Only works with Gemweb/CityWeb directory pages that expose data-entities attribute
	    - Common Swiss municipalities: Innertkirchen, Brienz, Hasliberg, Meiringen, etc.

	.LINK
	    https://www.i-web.ch - i-web.ch platform provider
	#>

	param(
	    [Parameter(Mandatory=$true)]
	    [string]$PageUrl,

	    [Parameter(Mandatory=$true)]
	    [string]$OutputFile
	)

	# Returns the decoded, trimmed text between the first '>' and the next '<' of
	# an HTML fragment (e.g. the label of an <a> element). Falls back to $Default
	# when the fragment contains no such text.
	function Get-LinkText {
	    param(
	        [string]$Html,
	        [string]$Default = ""
	    )

	    $linkMatch = [regex]::Match($Html, '>([^<]+)<')
	    $text = if ($linkMatch.Success) { $linkMatch.Groups[1].Value } else { $Default }
	    # The fragment is HTML, so its text may still carry entities such as &amp;.
	    return [System.Net.WebUtility]::HtmlDecode($text).Trim()
	}

	# Converts a value to a single-line string that is safe inside one TSV cell:
	# tabs and line breaks would otherwise shift columns or split the row.
	function ConvertTo-TsvField {
	    param($Value)

	    return (([string]$Value) -replace '[\t\r\n]+', ' ')
	}

	# Extracts a comma-separated postal address from the
	# <address class="icms-contact-container"> element of a detail page.
	# Returns an empty string when the element is not present.
	function Get-AddressFromHtml {
	    param([string]$Html)

	    # [^>]* (with the quantifier) allows any other attributes around class=.
	    $addressPattern = '<address[^>]*class="icms-contact-container"[^>]*>(.*?)</address>'
	    $addressMatch = [regex]::Match($Html, $addressPattern, [System.Text.RegularExpressions.RegexOptions]::Singleline)
	    if (-not $addressMatch.Success) {
	        return ""
	    }

	    # Every tag becomes a '|' separator so <br>, </p> etc. delimit address lines.
	    $text = [regex]::Replace($addressMatch.Groups[1].Value, '<[^>]+>', '|')
	    $text = [System.Net.WebUtility]::HtmlDecode($text)
	    # Collapse runs of whitespace (including newlines) into single spaces.
	    $text = [regex]::Replace($text, '\s+', ' ')

	    # -split takes a regex, so the literal pipe is escaped as '\|'.
	    $lines = $text -split '\|' | Where-Object { $_ -match '\S' } | ForEach-Object { $_.Trim() }
	    # Skip the first line (usually the company name) and keep the next three.
	    $addressLines = $lines | Select-Object -Skip 1 | Select-Object -First 3
	    $address = ($addressLines -join ', ')

	    # Strip "Tel."/"Fax" labels and phone numbers that slipped into those lines.
	    # \b keeps words that merely start with Tel/Fax (e.g. "Telefon") intact.
	    $address = [regex]::Replace($address, '\s*\bTel\b\.?\s*', '')
	    $address = [regex]::Replace($address, '\s*\bFax\b\.?\s*', '')
	    $address = [regex]::Replace($address, '\s*0\d{1,3}\s+\d{3}\s+\d{2}\s+\d{2}', '')
	    # Remove trailing commas and spaces
	    $address = $address -replace ',\s*$', ''
	    return $address.Trim()
	}

	# Detail pages live on the same site as the directory page, so derive
	# scheme + host from PageUrl instead of hard-coding a single municipality.
	$pageUri = $null
	if (-not [System.Uri]::TryCreate($PageUrl, [System.UriKind]::Absolute, [ref]$pageUri)) {
	    Write-Error "PageUrl is not a valid absolute URL: $PageUrl"
	    exit 1
	}
	$baseUrl = $pageUri.GetLeftPart([System.UriPartial]::Authority)

	# Fetch the page
	Write-Host "Fetching $PageUrl..."
	try {
	    $response = Invoke-WebRequest -Uri $PageUrl -UseBasicParsing -ErrorAction Stop
	}
	catch {
	    Write-Error "Could not fetch $PageUrl ($($_.Exception.Message))"
	    exit 1
	}

	# Extract data-entities attribute
	$pattern = 'data-entities="([^"]*)"'
	$match = [regex]::Match($response.Content, $pattern)

	if (-not $match.Success) {
	    Write-Error "Could not find data-entities attribute in page. Please verify this is a Gemweb/CityWeb directory page."
	    exit 1
	}

	# Decode HTML entities from the attribute value
	$jsonString = [System.Net.WebUtility]::HtmlDecode($match.Groups[1].Value)

	Write-Host "Parsing JSON data..."
	try {
	    $data = $jsonString | ConvertFrom-Json -ErrorAction Stop
	}
	catch {
	    Write-Error "Could not parse data-entities JSON ($($_.Exception.Message))"
	    exit 1
	}

	# Process the entries. A generic list avoids re-allocating an array on each append.
	$results = New-Object 'System.Collections.Generic.List[object]'
	$id = 1
	$totalCount = $data.data.Count

	foreach ($item in $data.data) {
	    # The name, telefon and email fields are HTML links; keep only the link text.
	    $name = Get-LinkText -Html $item.name -Default $item.name
	    $phone = Get-LinkText -Html $item.telefon
	    $email = Get-LinkText -Html $item.email

	    # Extract company ID from link
	    $idMatch = [regex]::Match($item.name, '/unternehmen/(\d+)')
	    $companyId = if ($idMatch.Success) { $idMatch.Groups[1].Value } else { "" }

	    # Fetch address from detail page
	    $address = ""
	    if ($companyId) {
	        Write-Host "[$id/$totalCount] Fetching address for $name (ID: $companyId)..."
	        try {
	            $detailUrl = "$baseUrl/_rte/unternehmen/$companyId"
	            $detailResponse = Invoke-WebRequest -Uri $detailUrl -UseBasicParsing -ErrorAction Stop
	            $address = Get-AddressFromHtml -Html $detailResponse.Content
	        }
	        catch {
	            # Best effort: a missing address must not abort the whole export.
	            Write-Warning "Could not fetch address for $name ($($_.Exception.Message))"
	        }
	        finally {
	            # Rate limiting applies after failed requests too.
	            Start-Sleep -Milliseconds 150
	        }
	    }

	    $results.Add([pscustomobject]@{
	        id      = $id
	        name    = (ConvertTo-TsvField $name)
	        ort     = (ConvertTo-TsvField $item.ort)
	        telefon = (ConvertTo-TsvField $phone)
	        email   = (ConvertTo-TsvField $email)
	        branche = (ConvertTo-TsvField $item.brancheId)
	        address = (ConvertTo-TsvField $address)
	    })

	    $id++
	}

	# Output as tab-separated values
	Write-Host "Writing to $OutputFile..."
	$output = New-Object 'System.Collections.Generic.List[string]'
	# NOTE(review): the header spells the phone column "telfon"; kept unchanged so
	# existing consumers of the file keep working.
	$output.Add("id`tname`tort`ttelfon`temail`tbranche`taddress")

	foreach ($row in $results) {
	    $output.Add("$($row.id)`t$($row.name)`t$($row.ort)`t$($row.telefon)`t$($row.email)`t$($row.branche)`t$($row.address)")
	}

	$output | Out-File -FilePath $OutputFile -Encoding UTF8

	Write-Host "Done! Parsed $($results.Count) companies to $OutputFile"
	Write-Host "Addresses have been extracted from detail pages"
No results found