Created
February 17, 2015 21:49
-
-
Save timj/4e8977aec24609b8b605 to your computer and use it in GitHub Desktop.
Simple demo for matching more than 2 catalogs at once
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Purpose: Take a collection of catalogues and match sources | |
For each matched source we need to know: | |
1. The catalogue rows that were deemed to be the same source | |
2. The catalogues that did not contain a specific source despite | |
the areal coverage of the catalogue including the position. | |
Algorithm | |
--------- | |
First catalogue is "reference". | |
For each subsequent catalog: | |
- Determine sky coverage of the new catalog | |
- Subset reference catalog to only include sources | |
within the same region | |
- Do the cross match (allowing multiple matches within 1 arcsec) | |
+ Append matches from new catalog to data structure indexed by reference source ID
+ Note IDs in reference (subset) that did not match anything in new | |
+ Note IDs in reference that had multiple matches in new | |
+ Append sources in the new catalog that did not match anything in reference to | |
end of reference (and associate them with the knowledge that they were not seen | |
in the previous N catalogs) | |
- Repeat for next | |
At this point we have one catalog with all the sources that were un-matched | |
at some point, along with a dict with all the matches for each row in that | |
catalog, and an indication of the catalogs the source was not seen in. | |
The next step is to handle the cases where a catalog had multiple matches. | |
This is where it gets complicated: it could be that one source split in to 2 | |
sources because seeing got better or resolution differences at different filters. | |
If the reference catalog had one source that became two, there would only be | |
a single source in the "reference" but it would have two separate sources | |
in the match. If the two became one the "one" would be in the reference list | |
twice. | |
A self-match on the reference catalog would give us all the targets requiring | |
more investigation when combined with the N-to-1 list. | |
""" | |
from __future__ import (absolute_import, division, | |
print_function) | |
import os | |
import collections | |
import pprint | |
import lsst.daf.persistence as dafPersist | |
import lsst.afw.geom as afwGeom | |
import lsst.afw.table as afwTable | |
import starlink.Ast as pyast | |
import palpy as pal | |
import numpy as np | |
import matplotlib.pyplot as plt | |
def exposureToRegion(exposure):
    """
    Return an AST region describing the sky footprint of an Exposure.

    The result is a pyast.Polygon in the ICRS sky frame whose vertices
    are the four corners of the exposure, traversed counter-clockwise.
    """
    wcs = exposure.getWcs()
    dims = exposure.getDimensions()

    # Pixel coordinates of the four corners, counter-clockwise.
    # Pixel centres are integral, so the detector edge is at -0.5.
    corners = [
        (-0.5, -0.5),
        (dims[0] - 0.5, -0.5),
        (dims[0] - 0.5, dims[1] - 0.5),
        (-0.5, dims[1] - 0.5),
    ]

    ras = []
    decs = []
    for px, py in corners:
        icrs = wcs.pixelToSky(px, py).toIcrs()
        ras.append(icrs.getRa().asRadians())
        decs.append(icrs.getDec().asRadians())

    return pyast.Polygon(pyast.SkyFrame("system=ICRS"), [ras, decs])
def catWithinRegion(cat, region):
    """
    Filter a SourceCatalog by an AST region.

    Returns a new (deep-copied) catalog containing only the sources
    from ``cat`` whose coordinates lie within ``region``.
    """
    coord_key = cat.getCoordKey()

    # pyast's tran() method wants a 2xN array of coordinates.  It maps
    # points outside the region to AST__BAD and leaves interior points
    # unchanged.  (May need to make tran2() public to avoid the
    # temporary numpy array.)
    transformed = region.tran([cat[coord_key.getRa()], cat[coord_key.getDec()]])

    # Checking a single coordinate axis is enough to detect AST__BAD.
    inside = transformed[0, :] != pyast.BAD
    return cat[inside].copy(deep=True)
def srcWithinRegion(srcRecord, region):
    """
    Test whether a single SourceRecord lies within an AST region.

    Parameters
    ----------
    srcRecord : SourceRecord
        Record whose (ra, dec) position is tested.
    region : pyast.Region
        Region to test against.

    Returns
    -------
    bool
        True if the source position lies within the region,
        False otherwise.
    """
    # tran() maps points outside the region to AST__BAD and leaves
    # interior points unchanged.
    mapped = region.tran([[srcRecord.getRa()], [srcRecord.getDec()]])
    # mapped is a 2x1 array: extract the scalar element so we return a
    # plain bool as documented, rather than a one-element numpy boolean
    # array (the original returned the array, which is truthy-compatible
    # but not actually True/False).
    return bool(mapped[0][0] != pyast.BAD)
def print_coords(rra, rdec):
    """
    Print ra and dec (supplied in radians) in sexagesimal form.

    Uses PAL for convenience.  Could equally be done with pyast or
    presumably afw.
    """
    # NOTE(review): the sign returned for the RA conversion is discarded
    # (overwritten by the Dec sign below) — presumably RA is already
    # normalised to be non-negative; confirm against callers.
    sign, rh, rm, rs, rfrac = pal.dr2tf(3, rra)
    sign, dd, dm, ds, dfrac = pal.dr2af(4, rdec)
    formatted = "{}h{}m{}.{}s {}{}d{}m{}.{}s".format(
        rh, rm, rs, rfrac, sign, dd, dm, ds, dfrac)
    print(formatted)
def match_catalogs(catalogs, regions, match_radius=1*afwGeom.arcseconds):
    """
    Merge a list of catalogs into a single cross-matched catalog.

    Given catalogs and associated region definitions defining the areal
    coverage of each catalog, merge them into a single catalog
    indicating all the cross-matches.  The first catalog is taken as
    the reference; sources from later catalogs that fail to match are
    appended to it.

    Parameters
    ----------
    catalogs : list of SourceCatalog
        Catalogs to merge.  The list itself is not modified.
    regions : list of pyast.Region
        Areal coverage of each catalog, parallel to ``catalogs``.
        The list itself is not modified.
    match_radius : afwGeom.Angle, optional
        Maximum separation for two sources to be considered the same
        (default: 1 arcsec).

    Returns
    -------
    tuple
        - combined catalog with all distinct items
        - dict indexed by ID in the combined catalog, containing a list
          of all items that matched that ID (a plain integer entry
          records the index of a catalog that covered the position but
          contained no match)
        - a list of lists containing all the matching source records
    """
    # Track all catalog entries that have been matched.  Each inner
    # list contains the SourceRecords deemed to be the same source;
    # this is similar to a Match object but can hold more than two
    # records.
    allmatches = []
    matchbyid = collections.OrderedDict()
    pp = pprint.PrettyPrinter(indent=4)

    # Copy BOTH supplied lists so the pop() calls below do not mutate
    # the caller's arguments.  (The original code copied only
    # ``catalogs``, silently emptying the caller's ``regions``.)
    catalogs = catalogs[:]
    regions = regions[:]

    # First catalog is the reference: seed the data structure.
    refcat = catalogs.pop(0)

    # Regions we have already processed, starting with the first one
    # (which is assumed to correspond to refcat).
    prevreg = [regions.pop(0)]

    # Known duplicates, dict indexed by refcat ID.
    dupes = dict()

    # Also fill the dict that allows lookup by ID.
    for s in refcat:
        seed = [s]
        allmatches.append(seed)
        matchbyid[s.getId()] = seed

    # Now compare with each remaining catalog in turn.
    for n, (c, reg) in enumerate(zip(catalogs, regions)):
        # Filter the reference catalog to the new catalog's footprint:
        # sources outside it cannot match by construction.
        tmpcat = catWithinRegion(refcat, reg)

        # Do the match between the current reference and the new
        # catalog, returning multiple matches where appropriate.
        print("Ref catalog has {} items, new catalog has {}".format(len(tmpcat), len(c)))
        matches = afwTable.matchRaDec(tmpcat, c, match_radius, False)
        print("{} matches out of {} and {}".format(len(matches), len(tmpcat), len(c)))

        # Index the new catalog by ID so we can easily keep track of
        # non-matches.
        catbyid = collections.OrderedDict()
        for sr in c:
            catbyid[sr.getId()] = sr

        # Likewise for the (filtered) reference catalog, so we can
        # track reference entries that did not match.
        refcatbyid = collections.OrderedDict()
        for sr in tmpcat:
            refcatbyid[sr.getId()] = sr

        # Walk the matches, recording each SourceRecord in the master
        # list and deleting entries from the lookup dicts as they are
        # consumed.  Because the match is not restricted to the closest
        # pair, both s1 and s2 can appear in more than one match.
        found = dict()
        for s1, s2, d in matches:
            refId = s1.getId()
            s2Id = s2.getId()
            matchbyid[refId].append(s2)
            if s2Id in catbyid:
                del catbyid[s2Id]
                found[s2Id] = refId
            else:
                # s2 was already claimed by an earlier reference source.
                print("{} was matched twice ({})".format(s2Id, refId))
                for dupId in (refId, found[s2Id]):
                    if dupId not in dupes:
                        dupes[dupId] = []
                    dupes[dupId].append(s2Id)
            # Mark this reference entry as processed.  A reference
            # source can appear in several match pairs, so use pop()
            # with a default: the original ``del`` raised KeyError on
            # the second occurrence of the same refId.
            refcatbyid.pop(refId, None)

        print("{} items failed to match from cat B and {} from cat A".format(len(catbyid), len(refcatbyid)))

        # Sources from the (filtered) reference catalog that failed to
        # match should now have that failure registered.  A null-match
        # should really be an object that looks like a SourceRecord but
        # which has flux of zero and a "bad" ID, associated with the
        # relevant non-matching catalog.  For now just log the index
        # into the catalog list, accounting for the missing ref catalog.
        for refId in refcatbyid.keys():
            matchbyid[refId].append(n + 1)

        # Unmatched sources from the current catalog need the failing
        # match status noted for every previous catalog whose footprint
        # covered their position.
        for srcId, sr in catbyid.items():
            missed = []
            for rcounter, oldreg in enumerate(prevreg):
                if srcWithinRegion(sr, oldreg):
                    missed.append(rcounter)
            missed.append(sr)  # Need to finish with this record
            matchbyid[srcId] = missed
            allmatches.append(missed)

        # Add these unmatched items to the reference catalogue.
        refcat.extend(catbyid.values())

        # catWithinRegion currently requires contiguous memory because
        # it uses getRa() etc.  That may not be a good idea, but for
        # now take a deep copy of the catalog after extending it.
        refcat = refcat.copy(deep=True)

        # Remember this region for subsequent iterations.
        prevreg.append(reg)

    print("Total size of catalogue: {}".format(len(allmatches)))
    print("Duplicates during initial match:")
    pp.pprint(dupes)

    # Self-match on the result to find possible source confusion
    # (splits/merges) needing further investigation.
    selfmatch = afwTable.matchRaDec(refcat, 1.5 * match_radius, False)
    print("Self match got {} results".format(len(selfmatch)))
    for s1, s2, dist in selfmatch:
        d = dist * pal.DR2AS
        print("{} self matches {} (d={} arcsec)".format(s1.getId(), s2.getId(), d))

    return (refcat, matchbyid, allmatches)
if __name__ == '__main__':
    # Test with the obs_sdss stack demo data.  The hard-coded default
    # location can be overridden with the DEMO_OUTPUT_ROOT environment
    # variable (backward-compatible: same path when unset).
    root = os.environ.get(
        "DEMO_OUTPUT_ROOT",
        "/Users/timj/work/lsst-builds/v10_0/lsst_dm_stack_demo/output")
    butler = dafPersist.Butler(root)

    # Get a set of catalogues for these filters
    FILTER_MAP = {'u': 0, 'g': 1, 'r': 2, 'i': 3, 'z': 4, 'y': 5}
    filters = "ugriz"  # "ugriz" are the full set in this case
    run = 6377
    field = 399

    cat = []
    regions = []
    # "filter_name" rather than "filter" to avoid shadowing the builtin.
    for filter_name in filters:
        dataId = dict(run=run, field=field, filter=filter_name, camcol=4)
        cat.append(butler.get('src', dataId))
        img = butler.get('calexp', dataId)
        regions.append(exposureToRegion(img))

    # Now we have catalogs and regions we do the merge
    (mergedcat, mergeddict, allmatches) = match_catalogs(cat, regions)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment