Skip to content

Instantly share code, notes, and snippets.

@timj
Created February 17, 2015 21:49
Show Gist options
  • Save timj/4e8977aec24609b8b605 to your computer and use it in GitHub Desktop.
Simple demo for matching more than 2 catalogs at once
#!/usr/bin/env python
"""
Purpose: Take a collection of catalogues and match sources
For each matched source we need to know:
1. The catalogue rows that were deemed to be the same source
2. The catalogues that did not contain a specific source despite
the areal coverage of the catalogue including the position.
Algorithm
---------
First catalogue is "reference".
For each subsequent catalog:
- Determine sky coverage of the new catalog
- Subset reference catalog to only include sources
within the same region
- Do the cross match (allowing multiple matches within 1 arcsec)
+ Append matches from new catalog to data structure indexed by reference source ID
+ Note IDs in reference (subset) that did not match anything in new
+ Note IDs in reference that had multiple matches in new
+ Append sources in the new catalog that did not match anything in reference to
end of reference (and associate them with the knowledge that they were not seen
in the previous N catalogs)
- Repeat for next
At this point we have one catalog with all the sources that were un-matched
at some point, along with a dict with all the matches for each row in that
catalog, and an indication of the catalogs the source was not seen in.
The next step is to handle the cases where a catalog had multiple matches.
This is where it gets complicated: it could be that one source split into 2
sources because seeing got better or resolution differences at different filters.
If the reference catalog had one source that became two, there would only be
a single source in the "reference" but it would have two separate sources
in the match. If the two became one the "one" would be in the reference list
twice.
A self-match on the reference catalog would give us all the targets requiring
more investigation when combined with the N-to-1 list.
"""
from __future__ import (absolute_import, division,
print_function)
import os
import collections
import pprint
import lsst.daf.persistence as dafPersist
import lsst.afw.geom as afwGeom
import lsst.afw.table as afwTable
import starlink.Ast as pyast
import palpy as pal
import numpy as np
import matplotlib.pyplot as plt
def exposureToRegion(exposure):
    """
    Return an AST region describing the areal coverage of an Exposure.

    The result is a pyast Polygon in the ICRS sky frame whose vertices
    are the sky positions of the four corners of the exposure.
    """
    wcs = exposure.getWcs()
    dims = exposure.getDimensions()

    # Pixel coordinates of the corners, ordered counter-clockwise
    corners = ((-0.5, -0.5),
               (dims[0] - 0.5, -0.5),
               (dims[0] - 0.5, dims[1] - 0.5),
               (-0.5, dims[1] - 0.5))

    ras = []
    decs = []
    for xpix, ypix in corners:
        icrs = wcs.pixelToSky(xpix, ypix).toIcrs()
        #print_coords(icrs[0], icrs[1])
        ras.append(icrs.getRa().asRadians())
        decs.append(icrs.getDec().asRadians())

    return pyast.Polygon(pyast.SkyFrame("system=ICRS"), [ras, decs])
def catWithinRegion(cat, region):
    """
    Return a deep-copied subset of *cat* containing only the sources
    whose sky coordinates lie within *region*.
    """
    coordKey = cat.getCoordKey()
    # The AST tran method requires a 2xN numpy array.
    # May need to make tran2() method public to avoid the temporary
    # numpy array.
    transformed = region.tran([cat[coordKey.getRa()], cat[coordKey.getDec()]])
    # Coordinates inside the region map to themselves; coordinates
    # outside come back as AST__BAD, so testing one axis suffices.
    inside = transformed[0, :] != pyast.BAD
    #print(cat[0].getId(), cat[inside][0].getId())
    return cat[inside].copy(deep=True)
def srcWithinRegion(srcRecord, region):
    """
    Return True if *srcRecord* lies within *region*, else False.

    Points outside the region are mapped to AST__BAD by region.tran().
    """
    mapped = region.tran([[srcRecord.getRa()], [srcRecord.getDec()]])
    # The original returned a length-1 numpy array (truthy by accident);
    # index the single element so callers get a plain bool.
    return bool(mapped[0][0] != pyast.BAD)
def print_coords(rra, rdec):
    """
    Given ra and dec in radians, print the position in sexagesimal.

    Uses PAL for convenience; pyast or presumably afw could do the same.
    Note the dec conversion's sign is the one printed (as in the
    original, where both unpackings bound the same name).
    """
    _, rah, ram, ras, rafrac = pal.dr2tf(3, rra)
    sign, dd, dm, ds, dfrac = pal.dr2af(4, rdec)
    print("{}h{}m{}.{}s {}{}d{}m{}.{}s".format(
        rah, ram, ras, rafrac, sign, dd, dm, ds, dfrac))
def match_catalogs( catalogs, regions, match_radius=1*afwGeom.arcseconds ):
    """
    Merge a collection of catalogs into a single cross-matched catalog.

    Parameters
    ----------
    catalogs : sequence of SourceCatalog
        Catalogs to merge; the first is treated as the reference.
        The supplied list itself is not modified.
    regions : sequence of Region
        Areal coverage of each catalog, parallel to ``catalogs``.
        NOTE(review): the first element is popped from this list, so the
        caller's list IS modified — confirm that is intended.
    match_radius : Angle, optional
        Maximum separation for two sources to count as a match.

    Returns
    -------
    tuple of:
    - combined catalog with all distinct items
    - dict indexed by ID in the combined catalog, containing a list of
      all items that matched that ID (plain integers in a list mark
      catalogs in which the source was expected but not seen)
    - a list of lists containing all the matching source records
    """
    # Each entry of allmatches is a list of SourceRecord objects deemed
    # to be the same source.  This is similar to a Match object but can
    # contain more than two SourceRecords.  matchbyid holds the very
    # same lists, keyed by reference-catalog ID.
    allmatches = []
    matchbyid = collections.OrderedDict()
    pp = pprint.PrettyPrinter(indent=4)

    # Copy the supplied list so we can pop without mutating the caller's
    catalogs = catalogs[:]

    # First catalog is the reference; it seeds the data structure
    refcat = catalogs.pop(0)

    # Regions already processed, starting with the first one
    # (which is assumed to match refcat)
    prevreg = [ regions.pop(0) ]

    # Known duplicates, dict indexed by refcat ID
    dupes = dict()

    for s in refcat:
        seed = [ s ]
        allmatches.append( seed )
        matchbyid[ s.getId() ] = seed

    # Now compare with each remaining catalog in turn
    for n, (c, reg) in enumerate(zip(catalogs, regions)):
        # Only reference sources inside the new catalog's footprint
        # can possibly match
        tmpcat = catWithinRegion( refcat, reg )
        print("Ref catalog has {} items, new catalog has {}".format(len(tmpcat),len(c)))

        # closest=False: return every pair within the radius, so a
        # source on either side may appear in multiple pairs
        matches = afwTable.matchRaDec(tmpcat, c, match_radius, False)
        print ("{} matches out of {} and {}".format(len(matches), len(tmpcat), len(c)))

        # Index the new catalog by ID so non-matches are easy to track
        catbyid = collections.OrderedDict()
        for sr in c:
            catbyid[ sr.getId() ] = sr

        # Likewise for the (filtered) reference catalog
        refcatbyid = collections.OrderedDict()
        for sr in tmpcat:
            refcatbyid[ sr.getId() ] = sr

        # Record each match, deleting matched entries from the dicts
        # as we go.  s2 can appear more than once.
        found = dict()
        for s1, s2, d in matches:
            refId = s1.getId()
            s2Id = s2.getId()
            matchbyid[ refId ].append( s2 )
            if s2Id in catbyid:
                del catbyid[ s2Id ]
                found[s2Id] = refId
            else:
                # s2 already matched a different reference source
                print("{} was matched twice ({})".format(s2Id, refId))
                for dupId in (refId, found[s2Id]):
                    if dupId not in dupes:
                        dupes[dupId] = []
                    dupes[dupId].append( s2Id )
            # Indicate that we have processed the entry from refcat.
            # BUGFIX: use pop() rather than del — refId legitimately
            # repeats when one reference source matches several new
            # sources, and a second del would raise KeyError.
            refcatbyid.pop( refId, None )

        print("{} items failed to match from cat B and {} from cat A".format(len(catbyid),len(refcatbyid)))

        # Reference sources within the footprint that failed to match
        # should have that failure registered.  A null-match should
        # really be an object that looks like a SourceRecord with zero
        # flux and a "bad" ID, associated with the non-matching catalog;
        # for now just log the index into the catalog, accounting for
        # the missing reference catalog.
        for missedId in refcatbyid.keys():
            matchbyid[ missedId ].append( n+1 )

        # Unmatched sources from the current catalog need their failing
        # match status noted for each previous catalog whose footprint
        # covered them.
        for newId, sr in catbyid.items():
            entry = []
            for rcounter, oldreg in enumerate(prevreg):
                if srcWithinRegion( sr, oldreg ):
                    entry.append( rcounter )
            entry.append( sr ) # Need to finish with this record
            matchbyid[ newId ] = entry
            allmatches.append( entry )

        # Adopt the unmatched new-catalog items into the reference
        refcat.extend( catbyid.values() )
        # catWithinRegion currently requires contiguous memory because
        # it uses getRa() etc.  That may not be a good idea but for now
        # we need a deep copy here or in catWithinRegion.
        refcat = refcat.copy(deep=True)

        # Store this region for future footprint checks
        prevreg.append( reg )

    print("Total size of catalogue: {}".format(len(allmatches)))
    print("Duplicates during initial match:")
    pp.pprint(dupes)

    # Self-match the result to find possible source confusion
    selfmatch = afwTable.matchRaDec( refcat, 1.5*match_radius, False )
    print("Self match got {} results".format(len(selfmatch)))
    for s1, s2, dist in selfmatch:
        d = dist * pal.DR2AS
        print("{} self matches {} (d={} arcsec)".format(s1.getId(), s2.getId(),d))

    return (refcat, matchbyid, allmatches)
if __name__ == '__main__':
    # Exercise the matcher using the obs_sdss stack demo data
    root = "/Users/timj/work/lsst-builds/v10_0/lsst_dm_stack_demo/output"
    butler = dafPersist.Butler(root)

    # Retrieve a set of catalogues for these filters
    FILTER_MAP = {'u': 0, 'g': 1, 'r': 2, 'i': 3, 'z': 4, 'y': 5}
    filters = "ugriz"  # "ugriz" are the full set in this case
    run = 6377
    field = 399

    cat = []
    regions = []
    for band in filters:
        dataId = dict(run=run, field=field, filter=band, camcol=4)
        cat.append(butler.get('src', dataId))
        calexp = butler.get('calexp', dataId)
        regions.append(exposureToRegion(calexp))

    # With catalogs and regions in hand, do the merge
    (mergedcat, mergeddict, allmatches) = match_catalogs(cat, regions)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment