Created
April 5, 2019 22:50
-
-
Save larshb/4426d7d9014fa2d4ad6525a72880efe3 to your computer and use it in GitHub Desktop.
Scrape apartment data based on id (Finn-kode) from finn.no
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import urllib.request, re | |
from pprint import pprint | |
import os | |
# Finn-kode identifiers of the apartments to scrape
codes = [143511428, 140785196]

# Ad URL prefix; the Finn-kode is appended as the value of `finnkode`
FINNROOT = "https://www.finn.no/realestate/homes/ad.html?finnkode="


def finnLink(code):
    """Return the finn.no ad URL for the given Finn-kode."""
    return "".join((FINNROOT, str(code)))
def cleanHTML(htmlString):
    """Return *htmlString* as single-line text with non-ASCII letters intact.

    Bytes input (what ``urlopen(...).read()`` returns) is decoded as UTF-8,
    with undecodable bytes replaced, instead of being passed through
    ``str()``. ``str()`` on bytes produces the ``b'...'`` repr, in which
    every non-ASCII character and every line break is a backslash escape.

    After decoding, carriage returns, line feeds and non-breaking spaces are
    removed.

    Non-bytes input is converted with ``str()`` and cleaned the way this
    function always did: real CR/LF characters and the literal escape
    sequences ``\\xc2`` / ``\\xa0`` are removed, and the literal escape
    sequences for UTF-8 encoded 'æ' and 'å' are turned back into the letters.
    """
    if isinstance(htmlString, (bytes, bytearray)):
        text = bytes(htmlString).decode('utf-8', errors='replace')
        # '\xa0' is the non-breaking space (UTF-8 bytes C2 A0).
        for ch in ('\r', '\n', '\xa0'):
            text = text.replace(ch, '')
        return text

    # Legacy path for text that already contains repr-style escapes.
    text = str(htmlString)
    for ch in ('\r', '\n', '\\xc2', '\\xa0'):
        text = text.replace(ch, '')
    for escaped, letter in (('\\xc3\\xa6', 'æ'), ('\\xc3\\xa5', 'å')):
        text = text.replace(escaped, letter)
    return text
def getParams(code):
    """Download the ad page for *code* and extract its parameters.

    The page at ``finnLink(code)`` is fetched, passed through ``cleanHTML``
    and searched with one regex per parameter (title, price, address, img).
    The first match of each regex is stored. A warning is printed when a
    regex matches nothing or more than once. The name/value pairs returned by
    ``getTableParams`` are merged in last, so they win on a key clash.

    Returns a dict mapping parameter name to the extracted string.
    """
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(finnLink(code)) as response:
        contents = response.read()
    plaintext = cleanHTML(contents)

    # (parameter name, regex with exactly one capture group)
    paramsAndPatterns = [
        ("title", r"class\=\"u\-t2[^>]*>([^<]*)"),
        ("price", r"Prisantydning.*t3[^\d]+(.*?) kr"),  # asking price
        ("address", r"<h3>(.*)</h3.*?<a href=\"https://kart"),
        ("img", r"<a href.*?<img.*?8px.*?src=\"([^\"]*)")
    ]

    params = {}
    for param, pattern in paramsAndPatterns:
        matches = re.findall(pattern, plaintext)
        if not matches:
            # Report a miss as a miss, not as "multiple" matches.
            print("WARNING %s matched no parameters" % param)
            continue
        if len(matches) > 1:
            print("WARNING %s matched multiple parameters" % param)
        params[param] = matches[0]

    params.update(getTableParams(plaintext))
    return params
def innerHtml(html, tag):
    """Return the text inside the first ``<tag>`` element of *html*.

    Only the exact strings ``<tag>`` and ``</tag>`` are recognised, so an
    opening tag that carries attributes is not found. The result ends at the
    first closing tag, or at the next opening tag if that comes earlier.
    An empty string is returned when either marker is absent from *html*.
    """
    opening = "<%s>" % tag
    closing = "</%s>" % tag
    if opening not in html or closing not in html:
        return ''
    # Text between the first and the second opening marker (or end of input).
    after_opening = html.split(opening, 2)[1]
    inner, _, _ = after_opening.partition(closing)
    return inner
def getTableParams(plaintext):
    """Collect the ``<dt>``/``<dd>`` pairs of the ``<dl>`` mentioning 'Boligtype'.

    *plaintext* is searched for spans that start at ``<dl class``, contain
    ``Boligtype`` and end at ``dl>``. Unless exactly one such span is found,
    a warning with the match count is printed and an empty dict is returned.
    Otherwise the span is cut at every ``</dd>`` and each piece contributes
    one name/value entry, provided both its ``<dt>`` and ``<dd>`` text are
    non-empty.
    """
    tables = re.findall(r"<dl class.*?Boligtype.*?dl>", plaintext)
    if len(tables) != 1:
        print("WARNING: Unable to fetch table parameters")
        print(str(len(tables)) + " regex matches")
        return {}

    params = {}
    for chunk in tables[0].split('</dd>'):
        # split() consumed the closing tag; restore it for innerHtml.
        entry = chunk + '</dd>'
        name = innerHtml(entry, 'dt')
        val = innerHtml(entry, 'dd')
        if name == '' or val == '':
            continue
        params[name] = val
    return params
print("Scraping")
# One (Finn-kode, parameter dict) pair per apartment, in the order of `codes`
codeParams = [(code, getParams(code)) for code in codes]
# Union of every parameter name seen across all apartments
keys = set()
for _, scraped in codeParams:
    keys.update(scraped)
print("Done scraping")
# Choose keys: a fixed column selection is used rather than every scraped key
# (the alternative would be list(keys))
keylist = 'img,Boligtype,Eieform,Totalpris,Bruksareal,Felleskost/mnd.'.split(',')

# Build table: a header row followed by one row per apartment; a parameter
# that an apartment lacks becomes an empty cell
table = [['code'] + keylist]
table.extend(
    [str(code)] + [params.get(key, '') for key in keylist]
    for code, params in codeParams
)
# Build csv: same columns as the table, one DELIM-separated line per apartment
DELIM = ';'
header_row = 'code' + DELIM + DELIM.join(keylist)
data_rows = [
    DELIM.join([str(code)] + [params.get(key, '') for key in keylist])
    for code, params in codeParams
]
csv = '\n'.join([header_row] + data_rows)
# The csv text is only built here; writing it to 'finn.csv' is disabled.
# Translate into read friendly keys | |
def translate(key):
    """Return the reader-friendly label for *key*, or *key* itself.

    Four internal names (title, img, price, code) have labels; anything else
    is returned unchanged. The key is always printed to stdout. When a label
    exists, a BEL character followed by the text ``key`` is printed as well.
    """
    print(key)
    labels = {
        'title': 'Beskrivelse',
        'img': 'Bilde',
        'price': 'Prisantydning',
        'code': 'Finn-kode'
    }
    try:
        label = labels[key]
    except KeyError:
        return key
    print('\akey')
    return label
# Build HTML
# The page is collected as a list of fragments and joined once at the end.
# The head declares UTF-8 so that the charset matches the encoding the file
# is written with below.
html_parts = [
'''
<meta charset="utf-8">
<link rel="stylesheet" href="https://fonts.googleapis.com/icon?family=Material+Icons">
<link rel="stylesheet" href="https://code.getmdl.io/1.3.0/material.indigo-pink.min.css">
<script defer src="https://code.getmdl.io/1.3.0/material.min.js"></script>
<style>
img {
    height: 10em;
    object-fit: cover;
    width: 14em;
    border-radius: 8px;
}
td {
    max-width: 200px;
}
</style>
'''
]
html_parts.append('<table class=\"mdl-data-table mdl-js-data-table\">')
for row_index, line in enumerate(table):
    # The first row of `table` holds the column names and is rendered as <th>.
    td = ('<th>', '</th>') if row_index == 0 else ('<td>', '</td>')
    html_parts.append('<tr>')
    for cell in line:
        html_parts.append(td[0])
        if len(cell) > 4 and cell.endswith('.jpg'):
            # A cell holding a .jpg URL is shown as an image.
            html_parts.append('<img src=\"' + cell + '\">')
        else:
            # NOTE(review): cells are scraped HTML fragments and are inserted
            # unescaped, as before; escaping would change how they render.
            html_parts.append(translate(cell))
        html_parts.append(td[1])
    html_parts.append('</tr>')
html_parts.append('</table>')
html = ''.join(html_parts)

# Write with an explicit encoding and close the file deterministically.
with open('finn.htm', 'w', encoding='utf-8') as htm_file:
    htm_file.write(html)

# Build class
pprint(keylist)
class FinnApartment:
    """One scraped apartment: its Finn-kode plus a few selected parameters.

    An attribute is only created when its source key is present in *params*,
    so a missing key leaves the attribute absent rather than set to a
    placeholder.
    """

    def __init__(self, code, params):
        self.code = code
        # (attribute name, key in params), in the order attributes are set.
        # 'Energimerking' is intentionally not mapped; the original author
        # marked it as bugged. The remaining scraped fields are not mapped
        # either (the author could not be bothered).
        attribute_sources = (
            ('img', 'img'),
            ('sharedcosts', 'Felleskost/mnd.'),
            ('price', 'price'),
            ('rooms', 'Rom'),
            ('area', 'Primærrom'),
            ('own', 'Eieform'),
        )
        for attribute, key in attribute_sources:
            if key in params:
                setattr(self, attribute, params[key])
# Wrap every scraped listing in a FinnApartment; nothing in the visible part
# of this script uses the instances afterwards.
for scraped in codeParams:
    ap = FinnApartment(*scraped)

# Windows: `start` opens the generated report with its associated program
os.system('start finn.htm')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment