Skip to content

Instantly share code, notes, and snippets.

@larshb
Created April 5, 2019 22:50
Show Gist options
  • Save larshb/4426d7d9014fa2d4ad6525a72880efe3 to your computer and use it in GitHub Desktop.
Save larshb/4426d7d9014fa2d4ad6525a72880efe3 to your computer and use it in GitHub Desktop.
Scrape apartment data based on ID (Finn-kode) from finn.no
import urllib.request, re
from pprint import pprint
import os
# Apartment codes (Finn-kode) of the listings to scrape
codes = [143511428, 140785196]
# Base URL of a listing page; the Finn-kode is appended as the query value
FINNROOT = "https://www.finn.no/realestate/homes/ad.html?finnkode="


def finnLink(code):
    """Return the finn.no listing URL for the given Finn-kode.

    ``code`` may be an int or a string; it is converted with ``str`` and
    appended to ``FINNROOT``.
    """
    return FINNROOT + str(code)
def cleanHTML(htmlString):
    """Return the page markup as decoded, single-line text.

    ``htmlString`` is normally the raw ``bytes`` read from the HTTP response.
    It is decoded as UTF-8 so that every non-ASCII character (not only the
    few that were substituted by hand before) comes out correctly. Any other
    object is converted with ``str``.

    Carriage returns, line feeds and non-breaking spaces are then removed,
    which lets the ``.`` in the scraping regexes span what used to be several
    lines and strips the separators inside numbers such as prices.
    """
    if isinstance(htmlString, (bytes, bytearray)):
        # errors='replace' keeps one malformed byte from aborting the scrape.
        htmlString = bytes(htmlString).decode('utf-8', errors='replace')
    else:
        htmlString = str(htmlString)
    for ch in ('\r', '\n', '\xa0'):
        htmlString = htmlString.replace(ch, '')
    return htmlString
def getParams(code):
    """Download one finn.no listing and extract its parameters.

    The page for ``code`` (a Finn-kode) is fetched, cleaned with
    ``cleanHTML`` and searched with one regex per parameter. The first match
    of each pattern is stored under the parameter name; a warning is printed
    when a pattern matches nothing or more than once. The name/value pairs
    found by ``getTableParams`` are merged in last.

    Returns a dict mapping parameter names to the extracted strings.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(finnLink(code)) as response:
        contents = response.read()
    plaintext = cleanHTML(contents)
    # Regex descriptors: (parameter name, pattern with one capture group)
    paramsAndPatterns = [
        ("title", r"class\=\"u\-t2[^>]*>([^<]*)"),
        ("price", r"Prisantydning.*t3[^\d]+(.*?) kr"),  # asking price
        ("address", r"<h3>(.*)</h3.*?<a href=\"https://kart"),
        ("img", r"<a href.*?<img.*?8px.*?src=\"([^\"]*)"),
    ]
    params = {}
    for param, pattern in paramsAndPatterns:
        matches = re.findall(pattern, plaintext)
        if not matches:
            # Report the real situation rather than claiming multiple matches.
            print("WARNING %s matched no parameters" % param)
            continue
        if len(matches) > 1:
            print("WARNING %s matched multiple parameters" % param)
        # With several candidates the first one wins.
        params[param] = matches[0]
    params.update(getTableParams(plaintext))
    return params
def innerHtml(html, tag):
    """Return the content of the first ``<tag>`` element in ``html``.

    Only attribute-less opening tags (exactly ``<tag>``) are recognised.
    An empty string is returned when either the opening or the closing tag
    is missing from ``html``.
    """
    opening = "<%s>" % tag
    closing = "</%s>" % tag
    if opening not in html or closing not in html:
        return ''
    # Text between the first opening tag and the next opening tag (if any) ...
    after_opening = html.split(opening, 2)[1]
    # ... cut off at the first closing tag inside it.
    return str(after_opening.split(closing, 1)[0])
def getTableParams(plaintext):
    """Extract the name/value pairs of the listing's definition list.

    Searches ``plaintext`` for ``<dl class ...>`` sections that mention
    "Boligtype". Exactly one such section must be found; otherwise a warning
    with the number of matches is printed and an empty dict is returned.

    The section is cut after every ``</dd>`` and each piece is read as one
    ``<dt>name</dt><dd>value</dd>`` pair via ``innerHtml``. Pairs with an
    empty name or value are skipped.
    """
    tables = re.findall(r"<dl class.*?Boligtype.*?dl>", plaintext)
    params = {}
    if len(tables) != 1:
        print("WARNING: Unable to fetch table parameters")
        print(str(len(tables)) + " regex matches")
        return params
    for chunk in tables[0].split('</dd>'):
        # split() swallowed the closing tag; put it back for innerHtml.
        entry = chunk + '</dd>'
        name = innerHtml(entry, 'dt')
        val = innerHtml(entry, 'dd')
        if name == '' or val == '':
            continue
        params[name] = val
    return params
# Scrape every listing, remembering its parameters and the union of all
# parameter names that were seen.
print("Scraping")
keys = set()
codeParams = []
for code in codes:
    params = getParams(code)
    keys.update(params)
    codeParams.append((code, params))
print("Done scraping")
# Choose keys
# (alternative: use every scraped key -> keylist = list(keys))
keylist = 'img,Boligtype,Eieform,Totalpris,Bruksareal,Felleskost/mnd.'.split(',')
# Build table: a header row followed by one row per listing; a key that is
# missing for a listing yields an empty cell.
table = [['code'] + keylist]
table.extend(
    [str(code)] + [params[key] if key in params else '' for key in keylist]
    for code, params in codeParams
)
# Build csv: header line plus one DELIM-separated line per listing; a key
# that is missing for a listing yields an empty field.
DELIM = ';'
csvLines = ['code' + DELIM + DELIM.join(keylist)]
for code, params in codeParams:
    fields = [str(code)]
    fields.extend(params[key] if key in params else '' for key in keylist)
    csvLines.append(DELIM.join(fields))
csv = '\n'.join(csvLines)
# Writing the CSV to disk is currently disabled:
#open('finn.csv','w').write(csv)
# Translate into read friendly keys
def translate(key):
    """Return the display label for ``key``.

    Keys that have an entry in the substitution table are replaced by their
    label; every other value is returned unchanged. The function has no side
    effects (it no longer prints debug output, which included an ASCII BEL).
    """
    subs = {
        'title': 'Beskrivelse',
        'img': 'Bilde',
        'price': 'Prisantydning',
        'code': 'Finn-kode',
    }
    return subs.get(key, key)
# Build HTML
# Page head: Material Design Lite assets plus a little styling for the
# thumbnails. The charset is declared because the file is written as UTF-8
# below and the cells may contain non-ASCII characters.
htmlParts = [
'''
<meta charset="utf-8">
<link rel="stylesheet" href="https://fonts.googleapis.com/icon?family=Material+Icons">
<link rel="stylesheet" href="https://code.getmdl.io/1.3.0/material.indigo-pink.min.css">
<script defer src="https://code.getmdl.io/1.3.0/material.min.js"></script>
<style>
img {
height: 10em;
object-fit: cover;
width: 14em;
border-radius: 8px;
}
td {
max-width: 200px;
}
</style>
''',
    '<table class="mdl-data-table mdl-js-data-table">',
]
for rowIndex, line in enumerate(table):
    # Only the first row (the column names) is rendered with header cells.
    td = ('<th>', '</th>') if rowIndex == 0 else ('<td>', '</td>')
    htmlParts.append('<tr>')
    for cell in line:
        htmlParts.append(td[0])
        if len(cell) > 4 and cell.endswith('.jpg'):
            # A cell holding a .jpg address is shown as the picture itself.
            htmlParts.append('<img src="' + cell + '">')
        else:
            htmlParts.append(translate(cell))
        htmlParts.append(td[1])
    htmlParts.append('</tr>')
htmlParts.append('</table>')
html = ''.join(htmlParts)
# Write with an explicit encoding and close the file deterministically; the
# locale default encoding may be unable to represent scraped characters.
with open('finn.htm', 'w', encoding='utf-8') as htmlFile:
    htmlFile.write(html)
# Build class
# Pretty-print the list of selected column keys to stdout.
pprint(keylist)
class FinnApartment:
    """One scraped listing with a few parameters exposed as attributes.

    ``code`` is always set. Every other attribute exists only when its
    source key is present in ``params``:

        img <- 'img', sharedcosts <- 'Felleskost/mnd.', price <- 'price',
        rooms <- 'Rom', area <- 'Primærrom', own <- 'Eieform'
    """

    def __init__(self, code, params):
        self.code = code
        # (key in params, attribute name), in assignment order.
        # 'Energimerking' is deliberately not mapped: the original marks
        # that field as bugged and leaves it unset.
        fieldMap = (
            ('img', 'img'),
            ('Felleskost/mnd.', 'sharedcosts'),
            ('price', 'price'),
            ('Rom', 'rooms'),
            ('Primærrom', 'area'),
            ('Eieform', 'own'),
        )
        for sourceKey, attribute in fieldMap:
            if sourceKey in params:
                setattr(self, attribute, params[sourceKey])
        # The remaining parameters are not mapped (author's note: could not
        # be bothered).
# Wrap every scraped listing in a FinnApartment; ``ap`` ends up holding the
# object for the last listing.
for finnCode, apartmentParams in codeParams:
    ap = FinnApartment(finnCode, apartmentParams)
# Windows: open the generated page with its associated program.
# (Opening finn.csv the same way is disabled: os.system('start finn.csv'))
os.system('start finn.htm')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment