Skip to content

Instantly share code, notes, and snippets.

@AKST
Last active September 2, 2024 04:46
Show Gist options
  • Save AKST/e9187f8c61be3e16af17740c440dbcf2 to your computer and use it in GitHub Desktop.
Save AKST/e9187f8c61be3e16af17740c440dbcf2 to your computer and use it in GitHub Desktop.
(Work in progress) NSW Valuer General Property Description Parser. This parses the "property description" field of the bulk data. You're welcome to use this as you please.
import re
from collections import namedtuple
from dataclasses import dataclass, field
from typing import Any, List, Optional, Tuple, Union
@dataclass
class LandParcel:
    """A land parcel reference extracted from a property description.

    Instances are produced by ``parse_land_parcel_ids``.
    """

    # Parcel identifier as assembled by the parser; it contains a "/".
    id: str
    # True when the reference was preceded by a "PT" token.
    part: bool = False
# Lightweight record types for items that carry only identifiers.
PublicReserve = namedtuple("PublicReserve", ('id',))
BusDepotLease = namedtuple("BusDepotLease", ('id',))
WindFarm = namedtuple("WindFarm", ('id',))
TelstraSite = namedtuple("TelstraSite", ('id',))
# Two identifiers: the site and the plan it belongs to.
SydneyPortsCorporationPlan = namedtuple(
    'SydneyPortsCorporationPlan',
    ('site', 'plan'),
)
@dataclass
class CoalLease:
    r"""
    For anything matching the grammar "Coal Lease \w+"
    """

    # Identifier captured after the "Coal Lease" prefix.
    id: str
    # True when the matched text carried a trailing "(Part)" marker.
    part: Optional[bool] = False
@dataclass
class ConsolidatedCoalLease:
    r"""
    For anything matching the grammar "Consolidated Coal Lease \w+"
    """

    # Identifier captured after the "Consolidated Coal Lease" prefix.
    id: str
    # True when the matched text carried a trailing "(Part)" marker.
    # Unlike CoalLease, this field has no default and must be supplied.
    part: bool
@dataclass
class ConsolidatedMiningLease:
    r"""
    For anything matching the grammar "Consolidated Mining Lease \w+"
    """

    # Identifier captured after the "Consolidated Mining Lease" prefix.
    id: str
# https://ablis.business.gov.au/service/nsw/mining-lease/16580
@dataclass
class MiningLease:
    r"""
    For anything matching the grammar "Mining Lease \w+"
    """

    # Identifier captured after the lease prefix.
    id: str
    # True when the matched text carried a trailing "(Part)" marker.
    part: Optional[bool] = False
# https://ablis.business.gov.au/service/nsw/mining-lease/16580
@dataclass
class MiningPurposeLease:
    r"""
    For anything matching the grammar "Mining Purpose Lease \w+"

    NOTE(review): the pattern tables in this file currently build a
    MiningLease for "Mining Purpose Lease" text, so this class looks
    unused -- confirm whether that is intended.
    """

    # Identifier following the "Mining Purpose Lease" prefix.
    id: str
@dataclass
class MineralClaim:
    r"""
    For anything matching the grammar "Mineral Claim \w+"
    """

    # Identifier captured after the "Mineral Claim" prefix.
    id: str
@dataclass
class MineralLease:
    r"""
    For anything matching the grammar "Mineral Lease \w+"
    """

    # Identifier captured after the "Mineral Lease" prefix.
    id: str
    # True when the matched text carried a trailing "(Part)" marker.
    part: Optional[bool] = False
@dataclass
class WesternLandLease:
    r"""
    For anything matching the grammar "Western Land Lease \w+"
    """

    # Identifier captured after the "Western Land Lease" prefix.
    id: str
@dataclass
class RailwayLandLease:
    r"""
    For anything matching the grammar "Railway Land Lease \w+(\.\w+)?"
    """

    # Identifier captured after the prefix; may contain a single "." part.
    id: str
@dataclass
class RailcorpFile:
    r"""
    For anything matching the grammar "RAILCORP. FILE: \w+"
    """

    # File identifier captured after the "RAILCORP. FILE:" prefix.
    id: str
# https://www.lls.nsw.gov.au/help-and-advice/travelling-stock-reserves/stock-watering-place-leases
@dataclass
class StockWaterPlaceLease:
    r"""
    For anything matching the grammar "Stock Watering Place \w+"
    """

    # Identifier captured after the "Stock Watering Place" prefix.
    id: str
@dataclass
class PermissiveOccupancy:
    r"""
    For parsing anything with the grammar "Permissive Occupancy \w+"
    """

    # Identifier captured after the "Permissive Occupancy" prefix.
    id: str
@dataclass
class OccupancyPermit:
    r"""
    For parsing the grammar "Occupancy Permit \w+"

    The pattern table in this file also maps "Occupation Permit \w+" and
    "Occupation Permit PB \w+" to this class.
    """

    # Identifier captured after the permit prefix.
    id: str
@dataclass
class LeaseNumber:
    r"""
    For parsing anything that has the grammar "Lease Number \w+( - \w+)?"
    or "Lease Number \w+ TO \w+"
    """

    # First identifier, always present.
    id_a: str
    # Second identifier; None when only one identifier was matched.
    # There is no default, so it must always be passed explicitly.
    id_b: Optional[str]
@dataclass
class ForestPermit:
    r"""
    For anything matching the grammar "Forest Permit \w+"
    """

    # Identifier captured after the "Forest Permit" prefix.
    id: str
@dataclass
class EnclosurePermit:
    r"""
    For anything matching the grammar "Enclosure Permit \w+"
    """

    # Identifier captured after the "Enclosure Permit" prefix.
    id: str
@dataclass
class DomesticWaterfrontOccupation:
    r"""
    For anything matching the grammar "Domestic Waterfront Occupancy \w+"
    """

    # Identifier captured after the "Domestic Waterfront Occupancy" prefix.
    id: str
@dataclass
class NonIrrigablePurchase:
    r"""
    For anything matching the grammar "Non-Irrigable Purchase \w+"
    """

    # Identifier captured after the "Non-Irrigable Purchase" prefix.
    id: str
@dataclass
class StateHeritageRegister:
    r"""
    For state heritage references such as "State Heritage Listing No \w+",
    "State Heritage Register \w+" and "Subject to SHR No \w+"
    """

    # Listing identifier captured from the matched text.
    id: str
@dataclass
class HousingPRN:
    r"""
    This is for the grammar "Housing PRN \w+"
    I am unsure what this actually is.
    """

    # Identifier captured after the "Housing PRN" prefix.
    id: str
@dataclass
class CrownLandLicense:
    r"""
    This is presumably a license to use crown land, the grammar is "Licence \w+"
    Very likely this:
    - https://ablis.business.gov.au/service/nsw/general-licence/40950?locations=NSW
    - https://www.crownland.nsw.gov.au/licences-leases-and-permits/do-i-need-licence-or-lease
    """

    # Identifier captured after the "Licence" prefix.
    id: str
@dataclass
class CrownReserve:
    r"""
    For parsing the grammar "Crown Reserve \w+"
    """

    # Identifier captured after the "Crown Reserve" prefix.
    id: str
@dataclass
class CrownPlan:
    r"""
    For parsing the grammar "Crown Plan \w+-\w+", optionally prefixed with
    "Part" or followed by "(Part)"
    """

    # Identifier before the hyphen.
    id_a: str
    # Identifier after the hyphen.
    id_b: str
    # True when the text was marked as a part ("Part ..." or "... (Part)").
    part: bool
# https://www.jstor.org/stable/26875607
@dataclass
class SpecialLease:
    r"""
    For parsing the grammar of "Special Lease \w+"
    """

    # Identifier captured after the "Special Lease" prefix.
    id: str
# https://www.transport.nsw.gov.au/operations/roads-and-waterways/waterways/property-planning/maritime-development/maritime
@dataclass
class NswMaritime:
    r"""
    For anything matching the grammar "NSW Maritime \w+"
    """

    # Identifier captured after the "NSW Maritime" prefix.
    id: str
@dataclass
class IsDrainageReserve:
    """Field-less marker emitted when "DRAINAGE RESERVE" appears in a description."""
# Union of the item types declared above. The original alias listed only a
# handful of them even though the pattern tables construct many more.
ParsedItem = Union[
    LandParcel,
    PublicReserve,
    BusDepotLease,
    WindFarm,
    TelstraSite,
    SydneyPortsCorporationPlan,
    CoalLease,
    ConsolidatedCoalLease,
    ConsolidatedMiningLease,
    MiningLease,
    MiningPurposeLease,
    MineralClaim,
    MineralLease,
    WesternLandLease,
    RailwayLandLease,
    RailcorpFile,
    StockWaterPlaceLease,
    PermissiveOccupancy,
    OccupancyPermit,
    LeaseNumber,
    ForestPermit,
    EnclosurePermit,
    DomesticWaterfrontOccupation,
    NonIrrigablePurchase,
    StateHeritageRegister,
    HousingPRN,
    CrownLandLicense,
    CrownReserve,
    CrownPlan,
    SpecialLease,
    NswMaritime,
    IsDrainageReserve,
]
@dataclass
class IdPattern:
    """A regex whose first capture group is the item's identifier."""

    # Compiled regular expression; group 1 holds the identifier.
    re: Any
    # Callable invoked as ``Const(id=<group 1>)`` to build the parsed item.
    Const: Any
@dataclass
class NamePattern:
    """A regex with named groups that map onto constructor keyword arguments."""

    # Compiled regular expression containing the named groups listed below.
    re: Any
    # Names of groups whose matched text is passed through unchanged.
    id_names: List[str]
    # Callable invoked with one keyword argument per listed group name.
    Const: Any
    # Names of groups passed as booleans (True when the group took part in
    # the match). These are group names, hence List[str] rather than List[bool].
    bool_names: List[str] = field(default_factory=list)
@dataclass
class FlagPattern:
    """A regex whose mere presence yields an argument-less marker item."""

    # Compiled regular expression to look for.
    re: Any
    # Callable invoked with no arguments for every match.
    Const: Any
# Single-identifier patterns, applied in this order. Each row is
# (regex source, regex flags, constructor); group 1 becomes the item's id.
id_patterns = [
    IdPattern(re=re.compile(source, flags), Const=constructor)
    for source, flags, constructor in (
        (r'Wind Farm\s+(\w+)', 0, WindFarm),
        (r"Consolidated Mining Lease\s+(\w+)", 0, ConsolidatedMiningLease),
        (r'Public Reserve\s+(\w+)', 0, PublicReserve),
        # there doesn't appear to be such a thing as a mining permit,
        # so it is recorded as a mining lease
        #
        # https://ablis.business.gov.au/search/customsearch
        (r'Mining Permit\s+(\w+)', 0, MiningLease),
        (r'Telstra Site Number\s+(\w+)', 0, TelstraSite),
        (r'Mineral Claim\s+(\w+)', 0, MineralClaim),
        (r'Western Land Lease\s+(\w+)', 0, WesternLandLease),
        (r'Railway Land Lease\s+(\w+(\.\w+)?)', 0, RailwayLandLease),
        (r'Stock Watering Place\s+(\w+)', 0, StockWaterPlaceLease),
        (r'Special Lease\s+(\w+)', 0, SpecialLease),
        (r'Forest Permit\s+(\w+)', 0, ForestPermit),
        (r'Enclosure Permit\s+(\w+)', 0, EnclosurePermit),
        (r'Non-Irrigable Purchase\s+(\w+)', 0, NonIrrigablePurchase),
        (r'NSW Maritime\s+(\w+)', 0, NswMaritime),
        (r'Housing PRN\s+(\w+)', 0, HousingPRN),
        (r'Licence\s+(\w+)', 0, CrownLandLicense),
        (r'BUS DEPOT LEASE\s+(\w+)', 0, BusDepotLease),
        # A 'STATE HERITAGE REGISTRAR <id>' row used to sit here but is
        # disabled.
        (r'State Heritage Listing No\s+(\w+)', re.IGNORECASE, StateHeritageRegister),
        (r'Permissive Occupancy\s+(\w+)', 0, PermissiveOccupancy),
        (r'RAILCORP\. FILE:\s+(\w+)', re.IGNORECASE, RailcorpFile),
        (r'Domestic Waterfront Occupancy\s+(\w+)', 0, DomesticWaterfrontOccupation),
        # I am guessing these are the same things
        (r'Occupation Permit PB\s+(\w+)', 0, OccupancyPermit),
        (r'Occupation Permit\s+(\w+)', 0, OccupancyPermit),
        (r'Occupancy Permit\s+(\w+)', 0, OccupancyPermit),
    )
]
# Patterns with named groups, applied in this order. Groups listed in
# `id_names` are passed as text; groups listed in `bool_names` are passed as
# True/False depending on whether they matched.
named_group_patterns = [
    NamePattern(
        Const=StateHeritageRegister,
        id_names=['id'],
        re=re.compile(
            r'State Heritage\s+(Register|REGISTRAR)\s+(SHR\s+NO\.\s+)?(?P<id>\w+)',
            re.IGNORECASE,
        ),
    ),
    NamePattern(
        Const=StateHeritageRegister,
        id_names=['id'],
        re=re.compile(r'Subject to (SHR|SHRL) No\s+(?P<id>\w+)', re.IGNORECASE),
    ),
    NamePattern(
        Const=CrownReserve,
        id_names=['id'],
        re=re.compile(r'Crown Reserve\s+(?P<id>\w+)'),
    ),
    # "Part Crown Plan ..." must be tried before the plain "Crown Plan ..."
    # pattern, which would otherwise match the same text without the prefix.
    NamePattern(
        Const=lambda **kwargs: CrownPlan(**kwargs, part=True),
        id_names=['id_a', 'id_b'],
        re=re.compile(r'Part Crown Plan\s+(?P<id_a>\w+)-(?P<id_b>\w+)'),
    ),
    NamePattern(
        Const=CrownPlan,
        id_names=['id_a', 'id_b'],
        bool_names=['part'],
        re=re.compile(
            r'Crown Plan\s+(?P<id_a>\w+)-(?P<id_b>\w+)(?P<part>\s+\(Part\))?'
        ),
    ),
    # Mining Lease — You will need a mining lease to extract minerals for the
    # PURPOSE of commercial mining
    # via: https://ablis.business.gov.au/service/nsw/mining-lease/16580
    NamePattern(
        Const=MiningLease,
        id_names=['id'],
        bool_names=['part'],
        re=re.compile(
            r'Mining (Purpose )?Lease\s+(?P<id>\w+)(?P<part>\s+\(Part\))?',
            re.IGNORECASE,
        ),
    ),
    # The consolidated form is listed ahead of the plain "Coal Lease" pattern.
    NamePattern(
        Const=ConsolidatedCoalLease,
        id_names=['id'],
        bool_names=['part'],
        re=re.compile(
            r"Consolidated Coal Lease\s+(?P<id>\w+)(?P<part>\s+\(Part\))?",
            re.IGNORECASE,
        ),
    ),
    NamePattern(
        Const=CoalLease,
        id_names=['id'],
        bool_names=['part'],
        re=re.compile(
            r'Coal Lease\s+(?P<id>\w+)(?P<part>\s+\(Part\))?',
            re.IGNORECASE,
        ),
    ),
    NamePattern(
        Const=MineralLease,
        id_names=['id'],
        bool_names=['part'],
        re=re.compile(
            r'Mineral Lease\s+(?P<id>\w+)(?P<part>\s+\(Part\))?',
            re.IGNORECASE,
        ),
    ),
    NamePattern(
        Const=LeaseNumber,
        id_names=['id_a', 'id_b'],
        re=re.compile(r'Lease Number\s+(?P<id_a>\w+) TO (?P<id_b>\w+)'),
    ),
    # id_b is None here when the optional " - <id>" suffix is absent.
    NamePattern(
        Const=LeaseNumber,
        id_names=['id_a', 'id_b'],
        re=re.compile(r'Lease Number\s+(?P<id_a>\w+)( - (?P<id_b>\w+))?'),
    ),
    NamePattern(
        Const=SydneyPortsCorporationPlan,
        id_names=['site', 'plan'],
        re=re.compile(
            r'Site\s+(?P<site>\w+) of Sydney Ports Corporation Plan\s+(?P<plan>\w+)'
        ),
    ),
]
# Patterns that carry no identifier: each occurrence yields one marker item.
flag_patterns = [
    FlagPattern(
        Const=IsDrainageReserve,
        re=re.compile('DRAINAGE RESERVE'),
    ),
]
# Text removed from a description BEFORE any item pattern is applied.
# Each row is (regex source, regex flags).
ignore_pre_patterns = [
    re.compile(source, flags)
    for source, flags in (
        (r"Licence for grazing", 0),
        (r"NSW Maritime Lease of \w+ sqm('s)?", re.IGNORECASE),
    )
]
# Text removed from a description AFTER the item patterns have run.
# Each row is (regex source, regex flags).
ignore_post_patterns = [
    re.compile(source, flags)
    for source, flags in (
        # e.g. "TOTAL SUBSURFACE AREA = 1090.5 HA"
        (r"(TOTAL )?SU(B)?SURFACE\s+(AREA\s+)?(=|-)?\s+(\w+)(\.\w+)?(\s+)?HA", re.IGNORECASE),
        (r"UNDEFINED ROAD RESERVE", 0),
        (r"& road reserve", re.IGNORECASE),
        (r"Crown Road", 0),
        (r"THE WANGANELLA WILDLIFE REFUGE NO \w+", re.IGNORECASE),
        (r"COONONG WILDLIFE REFUGE (NO )?\w+", re.IGNORECASE),
        # The two-dimension forms come before the single-dimension form so
        # the longer phrase is removed whole.
        (r'(Partly )?(Un)?Lim(it|ti)ed in height (and|but|\&) ((Un)?lim(it|ti)ed in )?depth', re.IGNORECASE),
        (r'(Partly )?(Un)?Lim(it|ti)ed in depth (and|but|\&) ((Un)?lim(it|ti)ed in )?height', re.IGNORECASE),
        (r'(Partly )?(Un)?Lim(it|ti)ed in (depth|height)', re.IGNORECASE),
        (r'LEASE ATTACHED TO PROPERTY', re.IGNORECASE),
        (r'Limited in Stratum', re.IGNORECASE),
        (r'(lease|least) (OVER|OVE) property', re.IGNORECASE),
        (r'PROPERTY (OVER|OVE) LEASE', re.IGNORECASE),
        (r'SUBSURFACE ONLY', re.IGNORECASE),
        # only appears on 3 different properties
        (r'Share Use', re.IGNORECASE),
        (r'Shared Use', re.IGNORECASE),
        # I saw this stand alone after a parcel number with zero context.
        (r'floor space area', 0),
        # A bare "(PART)" often trails licences and permits; a pattern for
        # it was tried and is currently disabled.
    )
]
def _parse_lot_list(
    tokens: List[str],
    start: int,
) -> Optional[Tuple[int, List[LandParcel]]]:
    """Parse a comma separated lot list that shares one plan.

    The list looks like ``1, PT 2, 3/DP123``: every token but the last ends
    with ",", any lot may be preceded by "PT", and the closing token holds
    the last lot and the plan separated by "/".

    Returns ``(next_index, parcels)`` on success, or ``None`` when the list
    is not closed by a token containing "/".
    """
    lots: List[Tuple[bool, str]] = []
    index = start
    while index < len(tokens):
        part = tokens[index] == 'PT'
        if part:
            index += 1
            if index >= len(tokens):
                return None
        token = tokens[index]
        index += 1
        # The comma check comes first, so "3/DP1," continues the list.
        if token.endswith(','):
            lots.append((part, token[:-1]))
            continue
        if '/' in token:
            lot, _, plan = token.partition('/')
            lots.append((part, lot))
            return index, [
                LandParcel(id=f'{lot_id}/{plan}', part=is_part)
                for is_part, lot_id in lots
            ]
        return None
    return None


def parse_land_parcel_ids(desc: str) -> Tuple[str, List[LandParcel]]:
    """Read the leading land parcel references from a description.

    Tokens are consumed from the start of ``desc`` for as long as they form
    parcel references:

    * ``lot/plan``                 -> one parcel, ``part=False``
    * ``PT lot/plan``              -> one parcel, ``part=True``
    * ``lot, [PT] lot, lot/plan``  -> one parcel per lot, all sharing the plan

    Parsing stops at the first token that fits none of these forms.

    Fixes relative to the previous implementation:

    * the return annotation now matches the ``(remainder, parcels)`` tuple
      that is actually returned;
    * leading or repeated whitespace no longer stops parsing at the first
      token;
    * a lot list that is never closed by a ``lot/plan`` token is left in the
      remainder instead of being silently discarded.

    Args:
        desc: whitespace separated description text.

    Returns:
        ``(remainder, parcels)`` where ``remainder`` is the unparsed tail of
        ``desc`` with its tokens joined by single spaces.
    """
    tokens = desc.split()
    land_parcels: List[LandParcel] = []
    index = 0
    while index < len(tokens):
        token = tokens[index]
        if '/' in token:
            land_parcels.append(LandParcel(id=token, part=False))
            index += 1
            continue
        following = tokens[index + 1] if index + 1 < len(tokens) else ''
        if token == 'PT' and '/' in following:
            land_parcels.append(LandParcel(id=following, part=True))
            index += 2
            continue
        # Only "PT" or a token ending in "," can open a lot list.
        if token != 'PT' and not token.endswith(','):
            break
        parsed = _parse_lot_list(tokens, index)
        if parsed is None:
            # Leave the whole unterminated list in the remainder.
            break
        index, group = parsed
        land_parcels.extend(group)
    return ' '.join(tokens[index:]), land_parcels
def parse_property_description(description: str) -> Tuple[str, List[ParsedItem]]:
    """Parse a "property description" string into structured items.

    Processing order:

    1. collapse whitespace and drop the ``ignore_pre_patterns`` text;
    2. extract every ``id_patterns``, ``named_group_patterns`` and
       ``flag_patterns`` match, removing the matched text as it goes;
    3. drop the ``ignore_post_patterns`` text;
    4. parse the land parcel references from what is left.

    Args:
        description: raw property description text.

    Returns:
        ``(remainder, items)``. ``remainder`` is whatever text could not be
        parsed, with all whitespace and "." characters removed (an empty
        string when everything was understood). ``items`` holds the pattern
        matches in pattern-table order followed by the land parcels.
    """
    description = re.sub(r'\s+', ' ', description)
    parsed_items: List[ParsedItem] = []

    for pattern in ignore_pre_patterns:
        description = pattern.sub('', description)

    for pattern in id_patterns:
        for match in pattern.re.finditer(description):
            parsed_items.append(pattern.Const(id=match.group(1)))
        description = pattern.re.sub('', description)

    for pattern in named_group_patterns:
        for match in pattern.re.finditer(description):
            text_kwargs = {k: match.group(k) for k in pattern.id_names}
            # An optional group that did not take part in the match is None.
            flag_kwargs = {
                k: match.group(k) is not None
                for k in pattern.bool_names
            }
            parsed_items.append(pattern.Const(**text_kwargs, **flag_kwargs))
        description = pattern.re.sub('', description)

    for pattern in flag_patterns:
        for _ in pattern.re.finditer(description):
            parsed_items.append(pattern.Const())
        description = pattern.re.sub('', description)

    for pattern in ignore_post_patterns:
        description = pattern.sub('', description)

    # Removing matched text can leave a leading space. Strip it so the
    # parcel parser starts on a real token rather than an empty one.
    description = re.sub(r'\s+', ' ', description).strip()
    description, land_parcels = parse_land_parcel_ids(description)
    description = re.sub(r'(\s+|\.)+', '', description)
    parsed_items.extend(land_parcels)
    return description, parsed_items
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment