Skip to content

Instantly share code, notes, and snippets.

@clintval
Created February 14, 2020 06:05
Show Gist options
  • Select an option

  • Save clintval/72e78280f64a10db691f01f921bd1245 to your computer and use it in GitHub Desktop.

Select an option

Save clintval/72e78280f64a10db691f01f921bd1245 to your computer and use it in GitHub Desktop.
object SampleUtil {

  /** Merge a collection of libraries that all belong to one sample into a single [[Sample]].
    *
    * The sample and library identifiers of every input are joined with `LibraryJoinDelimiter`. The optional fields
    * (project, description, i7 and i5 index bases) are joined the same way, with an empty string standing in for any
    * input that lacks the field. Extended attributes are merged key by key, again using an empty string as the
    * placeholder for inputs that do not define a key. Regardless of the lanes the libraries were sequenced on, the
    * merged sample has its lane cleared to [[None]] and its ordinal set to zero.
    *
    * @param samples the libraries to merge; must be non-empty and must all share one sample name
    * @return a single sample carrying the joined values of all inputs
    * @throws IllegalStateException when there are no libraries to merge
    * @throws IllegalArgumentException when trying to join samples with different sample names
    */
  def merge(samples: Seq[Sample]): Sample = {
    val sampleName = samples.map(_.sampleName) match {
      case Seq() => throw new IllegalStateException("Must attempt to join at at least one sample.")
      case Seq(head, tail @ _*) =>
        // Join the *sequence* of names for the message; calling mkString on each name would split it into characters.
        require(
          tail.forall(_ == head),
          s"All libraries must be the same sample, found: ${samples.map(_.sampleName).mkString(", ")}"
        )
        head
    }

    /** Join one field across all samples with the delimiter, yielding None when the joined string is blank.
      *
      * Note that with several samples the joined string contains the delimiters themselves, so (assuming the
      * delimiter is not whitespace) None is only produced when a single sample has a blank value.
      */
    def concat(f: Sample => String): Option[String] = {
      Option(samples.map(f).mkString(LibraryJoinDelimiter)).filter(_.trim.nonEmpty)
    }

    val sampleId     = samples.map(_.sampleId).mkString(LibraryJoinDelimiter)
    val libraryId    = samples.map(_.libraryId).mkString(LibraryJoinDelimiter)
    val project      = concat(_.project.getOrElse(""))
    val description  = concat(_.description.getOrElse(""))
    val i7IndexBases = concat(_.i7IndexBases.getOrElse(""))
    val i5IndexBases = concat(_.i5IndexBases.getOrElse(""))

    // Combine the extended attributes across all libraries while taking into account that not all libraries will
    // have every key set. Keys are visited in the order they are first seen across the input libraries, and a library
    // without a given key contributes an empty string so that each joined value keeps one slot per library. The
    // resulting immutable Map itself makes no iteration-order guarantee.
    val extendedKeys = samples.flatMap(_.extendedAttributes.keys).distinct
    val extendedAttributes = extendedKeys.map { key =>
      key -> samples.map(_.extendedAttributes.getOrElse(key, "")).mkString(LibraryJoinDelimiter)
    }.toMap

    new Sample(
      sampleOrdinal      = 0,
      sampleId           = sampleId,
      sampleName         = sampleName,
      libraryId          = libraryId,
      project            = project,
      description        = description,
      lane               = None,
      i7IndexBases       = i7IndexBases,
      i5IndexBases       = i5IndexBases,
      extendedAttributes = extendedAttributes
    )
  }
}
object SampleUtilTest {

  // Values shared by every fixture library below.
  private val SharedSampleName = "SamplePrime"
  private val SharedProject    = Some("project1")

  // Extended attribute keys, upper-cased once instead of at every use site.
  private val ReferenceKey   = ReferenceNameKey.toUpperCase
  private val EquivalentsKey = GenomeEquivalentsKey.toUpperCase

  /** First library of the shared sample; defines only the reference name attribute. */
  private val LibraryOne: Sample = new Sample(
    sampleOrdinal      = 0,
    sampleId           = "1",
    sampleName         = SharedSampleName,
    libraryId          = "LibraryOne",
    project            = SharedProject,
    description        = Some("Description 1"),
    lane               = Some(1),
    i7IndexBases       = Some("CAT"),
    i5IndexBases       = Some("TCA"),
    extendedAttributes = Map(ReferenceKey -> "mm10")
  )

  /** Second library of the shared sample; the only one that also defines genome equivalents. */
  private val LibraryTwo: Sample = new Sample(
    sampleOrdinal      = 0,
    sampleId           = "2",
    sampleName         = SharedSampleName,
    libraryId          = "LibraryTwo",
    project            = SharedProject,
    description        = Some("Description 2"),
    lane               = Some(2),
    i7IndexBases       = Some("TTT"),
    i5IndexBases       = Some("CCC"),
    extendedAttributes = Map(ReferenceKey -> "mm10", EquivalentsKey -> "2000")
  )

  /** Third library of the shared sample; defines only the reference name attribute. */
  private val LibraryThree: Sample = new Sample(
    sampleOrdinal      = 0,
    sampleId           = "3",
    sampleName         = SharedSampleName,
    libraryId          = "LibraryThree",
    project            = SharedProject,
    description        = Some("Description 3"),
    lane               = Some(5),
    i7IndexBases       = Some("AAA"),
    i5IndexBases       = Some("AAA"),
    extendedAttributes = Map(ReferenceKey -> "mm10")
  )

  "SampleUtil.merge" should "raise an exception when no libraries are defined" in {
    an[IllegalStateException] shouldBe thrownBy {
      SampleUtil.merge(Seq.empty)
    }
  }

  it should "raise an exception when samples have different names" in {
    // Same library as LibraryOne, but relabelled so that the sample names no longer agree.
    val mismatched = LibraryOne.copy(sampleName = s"${LibraryOne.sampleName}-different")
    an[IllegalArgumentException] shouldBe thrownBy {
      SampleUtil.merge(Seq(LibraryOne, mismatched))
    }
  }

  it should "return a single completely defined sample unchanged except for the sample ordinal and lane field" in {
    val expected = LibraryOne.copy(sampleOrdinal = 0, lane = None)
    SampleUtil.merge(Seq(LibraryOne)) shouldBe expected
  }

  it should "merge three libraries which have all of their fields set, and mismatching dictionaries" in {
    // Genome equivalents is only set on the second library, so the first and third slots are empty placeholders.
    val expected: Sample = new Sample(
      sampleOrdinal      = 0,
      sampleId           = "1;2;3",
      sampleName         = SharedSampleName,
      libraryId          = "LibraryOne;LibraryTwo;LibraryThree",
      project            = Some("project1;project1;project1"),
      description        = Some("Description 1;Description 2;Description 3"),
      lane               = None,
      i7IndexBases       = Some("CAT;TTT;AAA"),
      i5IndexBases       = Some("TCA;CCC;AAA"),
      extendedAttributes = Map(ReferenceKey -> "mm10;mm10;mm10", EquivalentsKey -> ";2000;")
    )
    val actual = SampleUtil.merge(Seq(LibraryOne, LibraryTwo, LibraryThree))
    actual shouldBe expected
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment