Skip to content

Instantly share code, notes, and snippets.

@timj
Created February 17, 2015 21:49
Show Gist options
  • Save timj/4e8977aec24609b8b605 to your computer and use it in GitHub Desktop.
Simple demo for matching more than 2 catalogs at once
#!/usr/bin/env python
"""
Purpose: Take a collection of catalogues and match sources
For each matched source we need to know:
1. The catalogue rows that were deemed to be the same source
2. The catalogues that did not contain a specific source despite
the areal coverage of the catalogue including the position.
Algorithm
---------
First catalogue is "reference".
For each subsequent catalog:
- Determine sky coverage of the new catalog
- Subset reference catalog to only include sources
within the same region
- Do the cross match (allowing multiple matches within 1 arcsec)
+ Append matches from new catalog to data structure indexed by reference source ID
+ Note IDs in reference (subset) that did not match anything in new
+ Note IDs in reference that had multiple matches in new
+ Append sources in the new catalog that did not match anything in reference to
end of reference (and associate them with the knowledge that they were not seen
in the previous N catalogs)
- Repeat for next
At this point we have one catalog with all the sources that were un-matched
at some point, along with a dict with all the matches for each row in that
catalog, and an indication of the catalogs the source was not seen in.
The next step is to handle the cases where a catalog had multiple matches.
This is where it gets complicated: it could be that one source split into 2
sources because seeing got better or resolution differences at different filters.
If the reference catalog had one source that became two, there would only be
a single source in the "reference" but it would have two separate sources
in the match. If the two became one the "one" would be in the reference list
twice.
A self-match on the reference catalog would give us all the targets requiring
more investigation when combined with the N-to-1 list.
"""
from __future__ import (absolute_import, division,
print_function)
import os
import collections
import pprint
import lsst.daf.persistence as dafPersist
import lsst.afw.geom as afwGeom
import lsst.afw.table as afwTable
import starlink.Ast as pyast
import palpy as pal
import numpy as np
import matplotlib.pyplot as plt
def exposureToRegion(exposure):
    """
    Return an AST region describing the areal coverage of an Exposure.

    The result is a pyast Polygon in the ICRS sky frame whose vertices
    are the sky positions of the four corners of the exposure.
    """
    wcs = exposure.getWcs()
    dims = exposure.getDimensions()

    # Pixel coordinates of the corners, ordered counter-clockwise
    corners = ((-0.5, -0.5),
               (dims[0] - 0.5, -0.5),
               (dims[0] - 0.5, dims[1] - 0.5),
               (-0.5, dims[1] - 0.5))

    ras = []
    decs = []
    for xpix, ypix in corners:
        icrs = wcs.pixelToSky(xpix, ypix).toIcrs()
        #print_coords(icrs[0], icrs[1])
        ras.append(icrs.getRa().asRadians())
        decs.append(icrs.getDec().asRadians())

    return pyast.Polygon(pyast.SkyFrame("system=ICRS"), [ras, decs])
def catWithinRegion(cat, region):
    """
    Return a deep-copied subset of *cat* containing only the sources
    whose sky coordinates lie within *region*.
    """
    coordKey = cat.getCoordKey()
    # The AST tran method requires a 2xN numpy array.
    # May need to make tran2() method public to avoid the temporary
    # numpy array.
    transformed = region.tran([cat[coordKey.getRa()], cat[coordKey.getDec()]])
    # Coordinates inside the region map to themselves; coordinates
    # outside come back as AST__BAD, so testing one axis suffices.
    inside = transformed[0, :] != pyast.BAD
    #print(cat[0].getId(), cat[inside][0].getId())
    return cat[inside].copy(deep=True)
def srcWithinRegion(srcRecord, region):
    """
    Return True if *srcRecord* lies within *region*, else False.

    Points outside the region are mapped to AST__BAD by region.tran().
    """
    mapped = region.tran([[srcRecord.getRa()], [srcRecord.getDec()]])
    # The original returned a length-1 numpy array (truthy by accident);
    # index the single element so callers get a plain bool.
    return bool(mapped[0][0] != pyast.BAD)
def print_coords(rra, rdec):
    """
    Given ra and dec in radians, print the position in sexagesimal.

    Uses PAL for convenience; pyast or presumably afw could do the same.
    Note the dec conversion's sign is the one printed (as in the
    original, where both unpackings bound the same name).
    """
    _, rah, ram, ras, rafrac = pal.dr2tf(3, rra)
    sign, dd, dm, ds, dfrac = pal.dr2af(4, rdec)
    print("{}h{}m{}.{}s {}{}d{}m{}.{}s".format(
        rah, ram, ras, rafrac, sign, dd, dm, ds, dfrac))
def match_catalogs( catalogs, regions, match_radius=1*afwGeom.arcseconds ):
    """
    Merge a collection of catalogs into a single cross-matched catalog.

    Parameters
    ----------
    catalogs : sequence of SourceCatalog
        Catalogs to merge; the first is treated as the reference.
        The supplied list itself is not modified.
    regions : sequence of Region
        Areal coverage of each catalog, parallel to ``catalogs``.
        NOTE(review): the first element is popped from this list, so the
        caller's list IS modified — confirm that is intended.
    match_radius : Angle, optional
        Maximum separation for two sources to count as a match.

    Returns
    -------
    tuple of:
    - combined catalog with all distinct items
    - dict indexed by ID in the combined catalog, containing a list of
      all items that matched that ID (plain integers in a list mark
      catalogs in which the source was expected but not seen)
    - a list of lists containing all the matching source records
    """
    # Each entry of allmatches is a list of SourceRecord objects deemed
    # to be the same source.  This is similar to a Match object but can
    # contain more than two SourceRecords.  matchbyid holds the very
    # same lists, keyed by reference-catalog ID.
    allmatches = []
    matchbyid = collections.OrderedDict()
    pp = pprint.PrettyPrinter(indent=4)

    # Copy the supplied list so we can pop without mutating the caller's
    catalogs = catalogs[:]

    # First catalog is the reference; it seeds the data structure
    refcat = catalogs.pop(0)

    # Regions already processed, starting with the first one
    # (which is assumed to match refcat)
    prevreg = [ regions.pop(0) ]

    # Known duplicates, dict indexed by refcat ID
    dupes = dict()

    for s in refcat:
        seed = [ s ]
        allmatches.append( seed )
        matchbyid[ s.getId() ] = seed

    # Now compare with each remaining catalog in turn
    for n, (c, reg) in enumerate(zip(catalogs, regions)):
        # Only reference sources inside the new catalog's footprint
        # can possibly match
        tmpcat = catWithinRegion( refcat, reg )
        print("Ref catalog has {} items, new catalog has {}".format(len(tmpcat),len(c)))

        # closest=False: return every pair within the radius, so a
        # source on either side may appear in multiple pairs
        matches = afwTable.matchRaDec(tmpcat, c, match_radius, False)
        print ("{} matches out of {} and {}".format(len(matches), len(tmpcat), len(c)))

        # Index the new catalog by ID so non-matches are easy to track
        catbyid = collections.OrderedDict()
        for sr in c:
            catbyid[ sr.getId() ] = sr

        # Likewise for the (filtered) reference catalog
        refcatbyid = collections.OrderedDict()
        for sr in tmpcat:
            refcatbyid[ sr.getId() ] = sr

        # Record each match, deleting matched entries from the dicts
        # as we go.  s2 can appear more than once.
        found = dict()
        for s1, s2, d in matches:
            refId = s1.getId()
            s2Id = s2.getId()
            matchbyid[ refId ].append( s2 )
            if s2Id in catbyid:
                del catbyid[ s2Id ]
                found[s2Id] = refId
            else:
                # s2 already matched a different reference source
                print("{} was matched twice ({})".format(s2Id, refId))
                for dupId in (refId, found[s2Id]):
                    if dupId not in dupes:
                        dupes[dupId] = []
                    dupes[dupId].append( s2Id )
            # Indicate that we have processed the entry from refcat.
            # BUGFIX: use pop() rather than del — refId legitimately
            # repeats when one reference source matches several new
            # sources, and a second del would raise KeyError.
            refcatbyid.pop( refId, None )

        print("{} items failed to match from cat B and {} from cat A".format(len(catbyid),len(refcatbyid)))

        # Reference sources within the footprint that failed to match
        # should have that failure registered.  A null-match should
        # really be an object that looks like a SourceRecord with zero
        # flux and a "bad" ID, associated with the non-matching catalog;
        # for now just log the index into the catalog, accounting for
        # the missing reference catalog.
        for missedId in refcatbyid.keys():
            matchbyid[ missedId ].append( n+1 )

        # Unmatched sources from the current catalog need their failing
        # match status noted for each previous catalog whose footprint
        # covered them.
        for newId, sr in catbyid.items():
            entry = []
            for rcounter, oldreg in enumerate(prevreg):
                if srcWithinRegion( sr, oldreg ):
                    entry.append( rcounter )
            entry.append( sr ) # Need to finish with this record
            matchbyid[ newId ] = entry
            allmatches.append( entry )

        # Adopt the unmatched new-catalog items into the reference
        refcat.extend( catbyid.values() )
        # catWithinRegion currently requires contiguous memory because
        # it uses getRa() etc.  That may not be a good idea but for now
        # we need a deep copy here or in catWithinRegion.
        refcat = refcat.copy(deep=True)

        # Store this region for future footprint checks
        prevreg.append( reg )

    print("Total size of catalogue: {}".format(len(allmatches)))
    print("Duplicates during initial match:")
    pp.pprint(dupes)

    # Self-match the result to find possible source confusion
    selfmatch = afwTable.matchRaDec( refcat, 1.5*match_radius, False )
    print("Self match got {} results".format(len(selfmatch)))
    for s1, s2, dist in selfmatch:
        d = dist * pal.DR2AS
        print("{} self matches {} (d={} arcsec)".format(s1.getId(), s2.getId(),d))

    return (refcat, matchbyid, allmatches)
if __name__ == '__main__':
    # Exercise the matcher using the obs_sdss stack demo data
    root = "/Users/timj/work/lsst-builds/v10_0/lsst_dm_stack_demo/output"
    butler = dafPersist.Butler(root)

    # Retrieve a set of catalogues for these filters
    FILTER_MAP = {'u': 0, 'g': 1, 'r': 2, 'i': 3, 'z': 4, 'y': 5}
    filters = "ugriz"  # "ugriz" are the full set in this case
    run = 6377
    field = 399

    cat = []
    regions = []
    for band in filters:
        dataId = dict(run=run, field=field, filter=band, camcol=4)
        cat.append(butler.get('src', dataId))
        calexp = butler.get('calexp', dataId)
        regions.append(exposureToRegion(calexp))

    # With catalogs and regions in hand, do the merge
    (mergedcat, mergeddict, allmatches) = match_catalogs(cat, regions)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment