Finding a house with Python
# Python 2 script: cStringIO, xrange and urllib.urlopen are not available
# under Python 3 without changes.
import scrapy
import peewee
import re
import urllib
import cStringIO

from PIL import Image
from playhouse.db_url import connect

# MySQL connection used by the peewee models below (see the schema dump further down)
db = connect('mysql://root:@127.0.0.1/house')


def dhash(image, hash_size=8):
    """Difference hash: greyscale, shrink to (hash_size + 1) x hash_size,
    then record whether each pixel is brighter than its right-hand neighbour."""
    image = image.convert('L').resize(
        (hash_size + 1, hash_size),
        Image.ANTIALIAS,
    )

    difference = []
    for row in xrange(hash_size):
        for col in xrange(hash_size):
            pixel_left = image.getpixel((col, row))
            pixel_right = image.getpixel((col + 1, row))
            difference.append(pixel_left > pixel_right)

    # Pack the booleans into a hex string, 8 bits at a time
    decimal_value = 0
    hex_string = []
    for index, value in enumerate(difference):
        if value:
            decimal_value += 2 ** (index % 8)
        if (index % 8) == 7:
            hex_string.append(hex(decimal_value)[2:].rjust(2, '0'))
            decimal_value = 0

    return ''.join(hex_string)


def hamming_distance(s1, s2):
    """Number of positions at which two equal-length strings differ."""
    if len(s1) != len(s2):
        raise ValueError('Undefined for sequences of unequal length')
    return sum(ch1 != ch2 for ch1, ch2 in zip(s1, s2))
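
# Sketch (not part of the original gist): how dhash() and hamming_distance()
# pair up to flag near-duplicate photos. The file paths are hypothetical and
# the threshold of 10 mirrors the one used in HouseSpider.parse_secondary_request.
def _images_look_alike(path_a, path_b, threshold=10):
    return hamming_distance(
        dhash(Image.open(path_a)),
        dhash(Image.open(path_b))
    ) <= threshold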
# One row per listing; `external` is the numeric id taken from the listing URL
class House(peewee.Model):
    bathrooms = peewee.IntegerField()
    floor = peewee.IntegerField()
    price = peewee.IntegerField()
    rooms = peewee.IntegerField()
    external = peewee.IntegerField(db_column='external_id', unique=True)
    rejected = peewee.IntegerField(index=True, null=True)
    street = peewee.TextField()
    url = peewee.CharField()

    class Meta:
        database = db


# One row per listing photo, storing its difference hash for duplicate detection
class Picture(peewee.Model):
    hash = peewee.CharField()
    house = peewee.ForeignKeyField(House, related_name='pictures')

    class Meta:
        database = db
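
# Sketch (not in the original gist): peewee's create_tables helper can build
# the same two tables straight from the models, as an alternative to importing
# the SQL dump further down.
def _create_schema():
    db.create_tables([House, Picture])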
class HouseSpider(scrapy.Spider):
    name = 'HouseSpider'
    # fotocasa.es search results page for Barcelona Capital listings
    start_urls = [
        'http://fotocasa.es/search/results.aspx?opi=36&bti=2&bsm=&tti=3&prchti=2&cu=es-ES&mode=1&maxp=600&minr=2&fs=true&ts=Barcelona%2520Capital&l=724,9,8,232,376,8019,0,0,0'
    ]

    def parse(self, response):
        # Each <li> on the results page is one listing; skip the ones already stored
        for li in response.css('div#photo-content ul.listPhotos li'):
            url, external_id = self.extract_url_info(
                li.css(
                    'div.property-information a.property-location'
                )
                .xpath('@href')
                .extract()[0]
            )

            try:
                House.get(House.external == external_id)
            except peewee.DoesNotExist:
                # Not seen before: fetch the detail page
                yield scrapy.Request(
                    url=url,
                    callback=self.parse_secondary_request,
                    meta={
                        'url': url,
                        'external_id': external_id
                    }
                )

    def parse_secondary_request(self, response):
        rooms = response.css('#litRooms b::text').extract()[0]
        try:
            bathrooms = response.css('#litBaths b::text').extract()[0]
        except IndexError:
            bathrooms = 0

        # Drop the fixed 9-character prefix of the title and anything after the
        # first comma, keeping just the street
        street = response.css('h1.property-title::text').extract()[0].strip()
        comma = street.find(',')
        street = street[9:comma if comma != -1 else len(street)]

        price = int(re.search(r'\d+', response.css('#priceContainer::text').extract()[0]).group())
        try:
            floor = int(re.search(r'\d+', response.css('#litFloor::text').extract()[0]).group())
        except (IndexError, AttributeError):
            floor = 0

        hashes = self.get_image_hashes(response)

        # Candidate duplicates: same rooms / bathrooms / floor, compared photo by photo
        houses = House.select(House, Picture).join(Picture).where(
            House.rooms == rooms,
            House.bathrooms == bathrooms,
            House.floor == floor
        )

        def find(houses, hashes):
            # A listing counts as a duplicate if any stored photo is within a
            # Hamming distance of 10 of any photo on this page
            for house in houses:
                for picture in house.pictures:
                    for hash in hashes:
                        if hamming_distance(picture.hash, hash) <= 10:
                            return house
            return None

        duplicate = find(houses, hashes)

        insert = False
        if duplicate is None:
            insert = True
        elif duplicate.price > price:
            # Same flat re-listed at a lower price: keep the cheaper ad
            duplicate.delete_instance(recursive=True)
            insert = True

        if insert:
            house = House(
                bathrooms=bathrooms,
                floor=floor,
                price=price,
                rooms=rooms,
                external=response.meta['external_id'],
                street=street,
                url=response.meta['url']
            )
            house.save()

            for hash in hashes:
                picture = Picture(
                    house=house.id,
                    hash=hash
                )
                picture.save()

    def get_image_hashes(self, response):
        # Download every carousel photo and compute its difference hash
        hashes = []
        for img in response.css('.carousel_slide img'):
            path = img.xpath('@data-src').extract()
            if len(path) == 0:
                path = img.xpath('@src').extract()
            path = path[0]
            path = cStringIO.StringIO(urllib.urlopen(path).read())
            hashes.append(dhash(Image.open(path)))
        return hashes

    def extract_url_info(self, long_url):
        # Detail-page URL without the query string; the listing id is the
        # numeric suffix after the last dash
        url = long_url[0:long_url.find('?')]
        external_id = url[url.rfind('-') + 1:]
        return url, external_id
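
To try the spider outside a full Scrapy project, it can be driven from a small script with Scrapy's CrawlerProcess. This is a minimal sketch, not part of the original gist; the module name house_spider.py is an assumption, and the house database from the schema below must already exist.

# run.py - hypothetical driver script; assumes the code above is saved as house_spider.py
from scrapy.crawler import CrawlerProcess
from house_spider import HouseSpider

process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
process.crawl(HouseSpider)
process.start()  # blocks until the crawl is finished

The same can be done from the command line with scrapy runspider house_spider.py. The MySQL schema below is what the two peewee models above map onto.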
-- --------------------------------------------------------
--
-- Table structure for table `house`
--
CREATE TABLE IF NOT EXISTS `house` (
  `id` int(10) unsigned NOT NULL,
  `external_id` int(10) unsigned NOT NULL,
  `rejected` tinyint(1) DEFAULT '0',
  `url` varchar(256) COLLATE utf16_bin NOT NULL,
  `rooms` int(11) NOT NULL,
  `bathrooms` int(11) NOT NULL,
  `floor` int(11) NOT NULL,
  `street` text COLLATE utf16_bin NOT NULL,
  `price` int(11) NOT NULL
) ENGINE=InnoDB AUTO_INCREMENT=22 DEFAULT CHARSET=utf16 COLLATE=utf16_bin;

-- --------------------------------------------------------
--
-- Table structure for table `picture`
--
CREATE TABLE IF NOT EXISTS `picture` (
  `id` int(10) unsigned NOT NULL,
  `house_id` int(10) unsigned NOT NULL,
  `hash` varchar(255) COLLATE utf16_bin NOT NULL
) ENGINE=InnoDB AUTO_INCREMENT=204 DEFAULT CHARSET=utf16 COLLATE=utf16_bin;

--
-- Indexes for dumped tables
--

--
-- Indexes for table `house`
--
ALTER TABLE `house`
  ADD PRIMARY KEY (`id`), ADD UNIQUE KEY `external_id` (`external_id`), ADD KEY `rejected` (`rejected`);

--
-- Indexes for table `picture`
--
ALTER TABLE `picture`
  ADD PRIMARY KEY (`id`), ADD KEY `house_id` (`house_id`);

--
-- AUTO_INCREMENT for dumped tables
--

--
-- AUTO_INCREMENT for table `house`
--
ALTER TABLE `house`
  MODIFY `id` int(10) unsigned NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=22;

--
-- AUTO_INCREMENT for table `picture`
--
ALTER TABLE `picture`
  MODIFY `id` int(10) unsigned NOT NULL AUTO_INCREMENT, AUTO_INCREMENT=204;