Skip to content

Instantly share code, notes, and snippets.

@AKST
Last active September 2, 2024 06:18
Show Gist options
  • Save AKST/1d3d31606bab417faa8150f07e22ca0b to your computer and use it in GitHub Desktop.
Save AKST/1d3d31606bab417faa8150f07e22ca0b to your computer and use it in GitHub Desktop.
Parser for NSW Valuer General Bulk Property Sales data, found here https://valuation.property.nsw.gov.au/embed/propertySalesInformation
from dataclasses import dataclass
from datetime import datetime
from typing import Optional
import os
def get_property_sales_data(file_path):
reader = PropertySalesRowReader()
try:
for kind, row in reader.get_rows(file_path):
if kind == 'A':
yield SaleRecordFile.from_row(row, file_path)
elif kind == 'B':
yield SalePropertyDetails.from_row(row)
elif kind == 'C':
yield SalePropertyLegalDescription.from_row(row)
elif kind == 'D':
yield SaleParticipant.from_row(row)
elif kind == 'Z':
yield SaleDataFileSummary.from_row(row)
else:
raise ValueError(f"Unexpected record type: {kind}")
except Exception as e:
print(kind, row, file_path, reader.remaining_buffer())
raise e
_SEMI_COLONS = { 'A': 5, 'B': 24, 'C': 6, 'D': 11, 'Z': 5 }
class PropertySalesRowReader:
def __init__(self, buffer_size=8192):
self._buffer_size = buffer_size
self._file_size = 0
self._buffer = ""
self._index = 0
self._buffer_start = 0
self._file = None
def get_rows(self, file_path):
with open(file_path, 'r') as file:
self._file = file
self._file_size = os.path.getsize(file_path)
self._load_buffer(0)
while self._index < self._file_size:
mode = self._read_buffer(self._index, self._index + 1)
if mode not in _SEMI_COLONS:
raise ValueError(f"Unexpected mode: {mode}")
row_end = self._find_nth_semicolon(_SEMI_COLONS[mode])
if row_end == -1:
raise ValueError('Unexpected end of data')
yield mode, self._read_buffer(self._index + 2, row_end).split(';')
self._index = row_end + 2
if mode == 'Z':
break
def remaining_buffer(self):
return self._buffer[self._index - self._buffer_start:]
def _load_buffer(self, start_index):
self._file.seek(start_index)
self._buffer = self._file.read(self._buffer_size)
self._buffer_start = start_index
def _find_nth_semicolon(self, n):
position = self._index
while n > 0:
found_index = self._buffer.find(';', position - self._buffer_start)
if found_index == -1:
self._load_buffer(self._index)
found_index = self._buffer.find(';', position - self._buffer_start)
if found_index == -1:
return -1
position = found_index + self._buffer_start + 1
n -= 1
return found_index + self._buffer_start
def _read_buffer(self, start, end):
if end - self._buffer_start > len(self._buffer):
self._load_buffer(self._index)
return self._buffer[start - self._buffer_start:end - self._buffer_start]
def parse_datetime(date_str):
return datetime.strptime(date_str, "%Y%m%d %H:%M")
def parse_date(date_str):
return datetime.strptime(date_str, "%Y%m%d")
@dataclass
class SaleRecordFile:
district: int
file_path: str
file_type: str
date_downloaded: datetime
submitting_user_id: Optional[str]
@staticmethod
def from_row(row, file_path=None):
return SaleRecordFile(
file_path=file_path,
file_type=row[0],
district=int(row[1]),
date_downloaded=parse_datetime(row[2]),
submitting_user_id=row[3],
)
@dataclass
class SalePropertyDetails:
district: int
property_id: str
sale_counter: str
date_downloaded: datetime
property_name: Optional[str]
property_unit_number: Optional[str]
property_house_number: Optional[str]
property_street_name: Optional[str]
property_locality: Optional[str]
property_postcode: Optional[int]
area: Optional[float]
area_type: Optional[str]
contract_date: datetime
settlement_date: datetime
purchase_price: float
zoning: Optional[str]
nature_of_property: Optional[int]
primary_purpose: Optional[int]
strata_lot_number: Optional[int]
component_code: Optional[int]
sale_code: Optional[int]
interest_of_sale: Optional[int]
dealing_number: str
@staticmethod
def from_row(row):
return SalePropertyDetails(
district=int(row[0]),
property_id=row[1],
sale_counter=row[2],
date_downloaded=parse_datetime(row[3]),
property_name=row[4] or None,
property_unit_number=row[5] or None,
property_house_number=row[6] or None,
property_street_name=row[7] or None,
property_locality=row[8] or None,
property_postcode=int(row[9]) if row[9] else None,
area=float(row[10]) if row[10] else None,
area_type=row[11] or None,
contract_date=parse_date(row[12]),
settlement_date=parse_date(row[13]),
purchase_price=float(row[14]),
zoning=row[15] or None,
nature_of_property=row[16] or None,
primary_purpose=row[17] or None,
strata_lot_number=row[18] or None,
component_code=row[19] or None,
sale_code=row[20] or None,
interest_of_sale=row[21] or None,
dealing_number=row[22],
)
@dataclass
class SalePropertyLegalDescription:
district: int
property_id: str
sale_counter: str
date_downloaded: datetime
property_legal_description: str
@staticmethod
def from_row(row):
return SalePropertyLegalDescription(
district=int(row[0]),
property_id=row[1],
sale_counter=row[2],
date_downloaded=parse_datetime(row[3]),
property_legal_description=row[4],
)
@dataclass
class SaleParticipant:
district: int
property_id: str
sale_counter: str
date_downloaded: datetime
participant: str
@staticmethod
def from_row(row):
return SaleParticipant(
district=int(row[0]),
property_id=row[1],
sale_counter=row[2],
date_downloaded=parse_datetime(row[3]),
participant=row[4],
)
@dataclass
class SaleDataFileSummary:
total_records: int
total_sale_property_details: int
total_sale_property_legal_descriptions: int
total_sale_participants: int
@staticmethod
def from_row(row):
return SaleDataFileSummary(
total_records=int(row[0]),
total_sale_property_details=int(row[1]),
total_sale_property_legal_descriptions=int(row[2]),
total_sale_participants=int(row[3]),
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment