Created
January 31, 2026 15:55
-
-
Save kohlerdominik/57b1fd9d49e02511efccdca3192c8a93 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<#
.SYNOPSIS
    Parse Gemweb/CityWeb Directory Pages - Extract company data from Swiss municipality directories
.DESCRIPTION
    This script extracts company listings from Gemweb/CityWeb directory websites (KantoneWeb platform by i-web.ch).
    These are commonly used by Swiss municipalities for their business directories.
    The script reads the data-entities JSON attribute from the directory page, parses it, and fetches detailed
    address information from each company's detail page. Detail pages are requested from the same host as
    PageUrl (<scheme>://<host>/_rte/unternehmen/<id>). Output is a tab-separated TSV file ready for analysis.
.PARAMETER PageUrl
    The full URL to the Gemweb/CityWeb directory page (e.g., https://www.innertkirchen.ch/firmenverzeichnis)
    The page must contain the data-entities attribute with company data.
.PARAMETER OutputFile
    The output file path for the TSV results (e.g., c:\data\companies.txt)
.EXAMPLE
    PowerShell -ExecutionPolicy Bypass -File gemweb_directory_scraper.ps1 `
        -PageUrl "https://www.innertkirchen.ch/firmenverzeichnis" `
        -OutputFile "c:\Users\Dominik\meiringen\innertkirchen_kontaktliste.txt"
.EXAMPLE
    PowerShell -ExecutionPolicy Bypass -File gemweb_directory_scraper.ps1 `
        -PageUrl "https://example.ch/firmenverzeichnis" `
        -OutputFile "c:\data\companies_list.txt"
.OUTPUTS
    Tab-separated values file with columns:
    - id (sequential number)
    - name (company name)
    - ort (municipality/city)
    - telfon (phone number; the header spelling is kept exactly as the script emits it)
    - email (email address)
    - branche (business category)
    - address (full postal address)
.NOTES
    Platform: Gemweb/CityWeb (KantoneWeb) by i-web.ch
    - The script includes 150ms delay between requests to be respectful to servers
    - Progress is shown as [current/total] during address fetching
    - Only works with Gemweb/CityWeb directory pages that expose data-entities attribute
    - Common Swiss municipalities: Innertkirchen, Brienz, Hasliberg, Meiringen, etc.
.LINK
    https://www.i-web.ch
#>
param(
    [Parameter(Mandatory=$true)]
    [string]$PageUrl,
    [Parameter(Mandatory=$true)]
    [string]$OutputFile
)

# Returns the entity-decoded text of the first text node found between two tags
# in $Html (e.g. the label of an <a> element), or $Default when there is none.
function Get-LinkText {
    param(
        [string]$Html,
        [string]$Default = ""
    )
    $textMatch = [regex]::Match($Html, '>([^<]+)<')
    if ($textMatch.Success) {
        return [System.Net.WebUtility]::HtmlDecode($textMatch.Groups[1].Value)
    }
    return $Default
}

# Fetch the page; stop right away with the real cause if the request fails
Write-Host "Fetching $PageUrl..."
try {
    $response = Invoke-WebRequest -Uri $PageUrl -UseBasicParsing -ErrorAction Stop
}
catch {
    Write-Error "Could not fetch ${PageUrl}: $($_.Exception.Message)"
    exit 1
}

# Extract data-entities attribute
$pattern = 'data-entities="([^"]*)"'
$match = [regex]::Match($response.Content, $pattern)
if (-not $match.Success) {
    Write-Error "Could not find data-entities attribute in page. Please verify this is a Gemweb/CityWeb directory page."
    exit 1
}

# Detail pages are served by the same site as the directory page, so take
# scheme + host (+ port) from PageUrl instead of assuming one municipality.
$baseUrl = ([System.Uri]$PageUrl).GetLeftPart([System.UriPartial]::Authority)

# Decode HTML entities from the attribute value
$jsonString = [System.Net.WebUtility]::HtmlDecode($match.Groups[1].Value)
Write-Host "Parsing JSON data..."
$data = $jsonString | ConvertFrom-Json

# Process and collect the data
$results = New-Object 'System.Collections.Generic.List[object]'
$id = 1
$totalCount = $data.data.Count
foreach ($item in $data.data) {
    # Name, phone and e-mail arrive as HTML links; keep only their visible text
    $name  = Get-LinkText -Html $item.name -Default $item.name
    $phone = (Get-LinkText -Html $item.telefon).Trim()
    $email = Get-LinkText -Html $item.email

    # Extract company ID from link
    $idMatch = [regex]::Match($item.name, '/unternehmen/(\d+)')
    $companyId = if ($idMatch.Success) { $idMatch.Groups[1].Value } else { "" }

    # Fetch address from detail page
    $address = ""
    if ($companyId) {
        Write-Host "[$id/$totalCount] Fetching address for $name (ID: $companyId)..."
        try {
            $detailUrl = "$baseUrl/_rte/unternehmen/$companyId"
            $detailResponse = Invoke-WebRequest -Uri $detailUrl -UseBasicParsing -ErrorAction Stop

            # Extract from <address class="icms-contact-container">
            $addressPattern = '<address[^>]*class="icms-contact-container"[^>]*>(.*?)</address>'
            $addressMatch = [regex]::Match($detailResponse.Content, $addressPattern, [System.Text.RegularExpressions.RegexOptions]::Singleline)
            if ($addressMatch.Success) {
                # Turn every tag into a '|' separator so that element boundaries become line boundaries
                $addressText = [regex]::Replace($addressMatch.Groups[1].Value, '<[^>]+>', '|')
                # Decode entities (&amp;, &nbsp;, ...) before collapsing whitespace so that
                # a decoded non-breaking space is normalised as well
                $addressText = [System.Net.WebUtility]::HtmlDecode($addressText)
                $addressText = [regex]::Replace($addressText, '\s+', ' ')

                # Split by | and filter empty lines
                $lines = $addressText -split '\|' | Where-Object { $_ -match '\S' } | ForEach-Object { $_.Trim() }

                # Rejoin first few lines as address (skip company name, usually first line)
                $address = ($lines | Select-Object -Skip 1 -First 3) -join ', '

                # Clean up "Tel." / "Fax" labels and phone numbers. The \b anchors keep
                # words that merely start with these letters (e.g. "Tellstrasse") intact.
                $address = [regex]::Replace($address, '\s*\bTel\b\.?\s*', '')
                $address = [regex]::Replace($address, '\s*\bFax\b\.?\s*', '')
                $address = [regex]::Replace($address, '\s*0\d{1,3}\s+\d{3}\s+\d{2}\s+\d{2}', '')

                # Remove trailing commas and spaces
                $address = ($address -replace ',\s*$', '').Trim()
            }
        }
        catch {
            Write-Warning "Could not fetch address for $name"
        }
        finally {
            # Rate limiting: pause after every request, including failed ones
            Start-Sleep -Milliseconds 150
        }
    }

    $results.Add(@{
        id      = $id
        name    = $name
        ort     = $item.ort
        telefon = $phone
        email   = $email
        branche = $item.brancheId
        address = $address
    })
    $id++
}

# Output as tab-separated values
Write-Host "Writing to $OutputFile..."
$output = @(
    # Header row; the "telfon" spelling is kept so existing consumers of the file keep working
    "id`tname`tort`ttelfon`temail`tbranche`taddress"
    foreach ($row in $results) {
        $fields = @(
            [string]$row.id, [string]$row.name, [string]$row.ort, [string]$row.telefon,
            [string]$row.email, [string]$row.branche, [string]$row.address
        )
        # A tab or line break inside a value would corrupt the row, so collapse it to a space
        ($fields | ForEach-Object { $_ -replace '[\t\r\n]+', ' ' }) -join "`t"
    }
)
$output | Out-File -FilePath $OutputFile -Encoding UTF8
Write-Host "Done! Parsed $($results.Count) companies to $OutputFile"
Write-Host "Addresses have been extracted from detail pages"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment