Last active
September 2, 2024 06:18
-
-
Save AKST/1d3d31606bab417faa8150f07e22ca0b to your computer and use it in GitHub Desktop.
Parser for NSW Valuer General Bulk Property Sales data, found here https://valuation.property.nsw.gov.au/embed/propertySalesInformation
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from dataclasses import dataclass | |
from datetime import datetime | |
from typing import Optional | |
import os | |
def get_property_sales_data(file_path): | |
reader = PropertySalesRowReader() | |
try: | |
for kind, row in reader.get_rows(file_path): | |
if kind == 'A': | |
yield SaleRecordFile.from_row(row, file_path) | |
elif kind == 'B': | |
yield SalePropertyDetails.from_row(row) | |
elif kind == 'C': | |
yield SalePropertyLegalDescription.from_row(row) | |
elif kind == 'D': | |
yield SaleParticipant.from_row(row) | |
elif kind == 'Z': | |
yield SaleDataFileSummary.from_row(row) | |
else: | |
raise ValueError(f"Unexpected record type: {kind}") | |
except Exception as e: | |
print(kind, row, file_path, reader.remaining_buffer()) | |
raise e | |
_SEMI_COLONS = { 'A': 5, 'B': 24, 'C': 6, 'D': 11, 'Z': 5 } | |
class PropertySalesRowReader: | |
def __init__(self, buffer_size=8192): | |
self._buffer_size = buffer_size | |
self._file_size = 0 | |
self._buffer = "" | |
self._index = 0 | |
self._buffer_start = 0 | |
self._file = None | |
def get_rows(self, file_path): | |
with open(file_path, 'r') as file: | |
self._file = file | |
self._file_size = os.path.getsize(file_path) | |
self._load_buffer(0) | |
while self._index < self._file_size: | |
mode = self._read_buffer(self._index, self._index + 1) | |
if mode not in _SEMI_COLONS: | |
raise ValueError(f"Unexpected mode: {mode}") | |
row_end = self._find_nth_semicolon(_SEMI_COLONS[mode]) | |
if row_end == -1: | |
raise ValueError('Unexpected end of data') | |
yield mode, self._read_buffer(self._index + 2, row_end).split(';') | |
self._index = row_end + 2 | |
if mode == 'Z': | |
break | |
def remaining_buffer(self): | |
return self._buffer[self._index - self._buffer_start:] | |
def _load_buffer(self, start_index): | |
self._file.seek(start_index) | |
self._buffer = self._file.read(self._buffer_size) | |
self._buffer_start = start_index | |
def _find_nth_semicolon(self, n): | |
position = self._index | |
while n > 0: | |
found_index = self._buffer.find(';', position - self._buffer_start) | |
if found_index == -1: | |
self._load_buffer(self._index) | |
found_index = self._buffer.find(';', position - self._buffer_start) | |
if found_index == -1: | |
return -1 | |
position = found_index + self._buffer_start + 1 | |
n -= 1 | |
return found_index + self._buffer_start | |
def _read_buffer(self, start, end): | |
if end - self._buffer_start > len(self._buffer): | |
self._load_buffer(self._index) | |
return self._buffer[start - self._buffer_start:end - self._buffer_start] | |
def parse_datetime(date_str): | |
return datetime.strptime(date_str, "%Y%m%d %H:%M") | |
def parse_date(date_str): | |
return datetime.strptime(date_str, "%Y%m%d") | |
@dataclass | |
class SaleRecordFile: | |
district: int | |
file_path: str | |
file_type: str | |
date_downloaded: datetime | |
submitting_user_id: Optional[str] | |
@staticmethod | |
def from_row(row, file_path=None): | |
return SaleRecordFile( | |
file_path=file_path, | |
file_type=row[0], | |
district=int(row[1]), | |
date_downloaded=parse_datetime(row[2]), | |
submitting_user_id=row[3], | |
) | |
@dataclass | |
class SalePropertyDetails: | |
district: int | |
property_id: str | |
sale_counter: str | |
date_downloaded: datetime | |
property_name: Optional[str] | |
property_unit_number: Optional[str] | |
property_house_number: Optional[str] | |
property_street_name: Optional[str] | |
property_locality: Optional[str] | |
property_postcode: Optional[int] | |
area: Optional[float] | |
area_type: Optional[str] | |
contract_date: datetime | |
settlement_date: datetime | |
purchase_price: float | |
zoning: Optional[str] | |
nature_of_property: Optional[int] | |
primary_purpose: Optional[int] | |
strata_lot_number: Optional[int] | |
component_code: Optional[int] | |
sale_code: Optional[int] | |
interest_of_sale: Optional[int] | |
dealing_number: str | |
@staticmethod | |
def from_row(row): | |
return SalePropertyDetails( | |
district=int(row[0]), | |
property_id=row[1], | |
sale_counter=row[2], | |
date_downloaded=parse_datetime(row[3]), | |
property_name=row[4] or None, | |
property_unit_number=row[5] or None, | |
property_house_number=row[6] or None, | |
property_street_name=row[7] or None, | |
property_locality=row[8] or None, | |
property_postcode=int(row[9]) if row[9] else None, | |
area=float(row[10]) if row[10] else None, | |
area_type=row[11] or None, | |
contract_date=parse_date(row[12]), | |
settlement_date=parse_date(row[13]), | |
purchase_price=float(row[14]), | |
zoning=row[15] or None, | |
nature_of_property=row[16] or None, | |
primary_purpose=row[17] or None, | |
strata_lot_number=row[18] or None, | |
component_code=row[19] or None, | |
sale_code=row[20] or None, | |
interest_of_sale=row[21] or None, | |
dealing_number=row[22], | |
) | |
@dataclass | |
class SalePropertyLegalDescription: | |
district: int | |
property_id: str | |
sale_counter: str | |
date_downloaded: datetime | |
property_legal_description: str | |
@staticmethod | |
def from_row(row): | |
return SalePropertyLegalDescription( | |
district=int(row[0]), | |
property_id=row[1], | |
sale_counter=row[2], | |
date_downloaded=parse_datetime(row[3]), | |
property_legal_description=row[4], | |
) | |
@dataclass | |
class SaleParticipant: | |
district: int | |
property_id: str | |
sale_counter: str | |
date_downloaded: datetime | |
participant: str | |
@staticmethod | |
def from_row(row): | |
return SaleParticipant( | |
district=int(row[0]), | |
property_id=row[1], | |
sale_counter=row[2], | |
date_downloaded=parse_datetime(row[3]), | |
participant=row[4], | |
) | |
@dataclass | |
class SaleDataFileSummary: | |
total_records: int | |
total_sale_property_details: int | |
total_sale_property_legal_descriptions: int | |
total_sale_participants: int | |
@staticmethod | |
def from_row(row): | |
return SaleDataFileSummary( | |
total_records=int(row[0]), | |
total_sale_property_details=int(row[1]), | |
total_sale_property_legal_descriptions=int(row[2]), | |
total_sale_participants=int(row[3]), | |
) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment