Created
March 11, 2012 15:35
-
-
Save ib84/2016844 to your computer and use it in GitHub Desktop.
hypergraphDB Text Experiments
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import test.JHGDB.TestCommons._ | |
import org.hypergraphdb.HGQuery.hg | |
import scala.collection.JavaConversions._ | |
import org.hypergraphdb.query.AtomTypeCondition | |
import test.Text.Word3 | |
import java.util.ArrayList | |
import org.hypergraphdb.indexing.{ByTargetIndexer, ByPartIndexer} | |
import org.hypergraphdb.{HyperGraph, HGHandle, HGPlainLink} | |
/**
 * Experiment: store two texts in HyperGraphDB as plain links over word atoms
 * and query for every link that contains a given word.
 *
 * Each text is split into word tokens, every token is asserted as a `Word3`
 * atom (so an already stored word is reused rather than duplicated), and the
 * ordered handles of a text are stored as one `HGPlainLink`.
 */
object HGDBRecursiveTextTest {
  val graph: HyperGraph = initializeGraph("localhost", "6378")

  /**
   * Punctuation marks and the placeholder word each one is stored as.
   * Used in both directions: for tokenizing and for printing.
   */
  private val placeholders: Seq[(String, String)] = Seq(
    "," -> "KOMMA",
    "." -> "POINT",
    "?" -> "QUESTIONMARK",
    "!" -> "EXCLAMATION"
  )

  /**
   * Registers the indexers, stores both sample texts, then prints every link
   * containing the word "sit" followed by the number of `Word3` atoms.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Part One
    val word3word = graph.getTypeSystem().getTypeHandle(classOf[Word3])
    graph.getIndexManager().register(new ByPartIndexer(word3word, "word"))
    val plainLinkHandle = graph.getTypeSystem().getTypeHandle(classOf[HGPlainLink])
    // graph.getIndexManager().register(new ByPartIndexer(plainLinkHandle, "word"))
    graph.getIndexManager().register(new ByTargetIndexer(plainLinkHandle, 0))
    graph.runMaintenance()

    val loremIpsum = "Lorem ipsum dolor sit amet, consectetur adipisici elit, sed eiusmod tempor incidunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquid ex ea commodi consequat. Quis aute iure reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint obcaecat cupiditat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum."
    val loremIpsumOrig = "... qui dolorem ipsum, quia dolor sit, amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt, ut labore et dolore magnam aliquam quaerat voluptatem. ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur? quis autem vel eum iure reprehenderit, qui in ea voluptate velit esse, quam nihil molestiae consequatur, vel illum, qui dolorem eum fugiat, quo voluptas nulla pariatur? At vero eos et accusamus et iusto odio dignissimos ducimus, qui blanditiis praesentium voluptatum deleniti atque corrupti, quos dolores et quas molestias excepturi sint, obcaecati cupiditate non provident, similique sunt in culpa, qui officia deserunt mollitia animi, id est laborum et dolorum fuga. "

    storePhrase(loremIpsum, word3word, plainLinkHandle)
    storePhrase(loremIpsumOrig, word3word, plainLinkHandle)

    // Second part
    println("gimme all sentence containing the word \"sit\".\n")
    val sitHandle = hg.assertAtom(graph, new Word3("sit"), word3word)
    hg.getAll(graph, hg.link(sitHandle)).asInstanceOf[java.util.List[HGPlainLink]].foreach { link =>
      print("\n")
      link.foreach { handle => print(" " + restorePunctuation(graph.get(handle).toString)) }
    }

    println("\nCount of words by AtomTypeCondition Word3")
    println(graph.getAll(new AtomTypeCondition(classOf[Word3])).size)
    println("end of test")
  }

  /**
   * Splits a text into word tokens, turning punctuation into placeholder words.
   *
   * A ',', '.', '?' or '!' that is followed by a space, or that ends the
   * text, becomes its own token (KOMMA, POINT, QUESTIONMARK, EXCLAMATION).
   * Empty tokens, e.g. from doubled spaces or leading punctuation, are dropped.
   *
   * @param s the text to tokenize
   * @return the tokens in their original order
   */
  def splitStringToWords(s: String): Array[String] = {
    // The appended space lets a punctuation mark at the very end of the text
    // match the same "<mark><space>" pattern as one in the middle.
    val marked = placeholders.foldLeft(s + " ") { case (text, (mark, token)) =>
      text.replace(mark + " ", " " + token + " ")
    }
    marked.split(" ").filter(_.nonEmpty)
  }

  /**
   * Asserts every word of `text` as a `Word3` atom and stores the ordered word
   * handles as a single `HGPlainLink`, unless a link over exactly these
   * handles is already found by the `hg.link` condition.
   */
  private def storePhrase(text: String, wordType: HGHandle, linkType: HGHandle): HGHandle = {
    val wordHandles =
      splitStringToWords(text).map(w => hg.assertAtom(graph, new Word3(w), wordType)).toArray[HGHandle]
    hg.addUnique(
      graph,
      new HGPlainLink(wordHandles: _*),
      linkType,
      hg.link(java.util.Arrays.asList[HGHandle](wordHandles: _*))
    )
  }

  /**
   * Replaces the first occurrence of each placeholder word in `text` with the
   * punctuation mark it stands for.
   */
  private def restorePunctuation(text: String): String =
    // replaceFirst takes a regex and a replacement template; the placeholders
    // are plain letters and the marks contain neither '$' nor '\', so no
    // quoting is needed.
    placeholders.foldLeft(text) { case (acc, (mark, token)) => acc.replaceFirst(token, mark) }
}
Ah! I forgot to mention: so far nothing here is recursive, but the original idea is an HGDB text type that is composed of sequences of words and/or other texts. To be continued.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is a first attempt at creating a recursive Text type in HyperGraphDB. The idea is simple: split any String at " " into words and save the single words into HGDB, making sure that a given word is stored and referenced by only a single handle (using hg.assertAtom and hg.addUnique). Sentences then only store sequences of handles to words. Sentences that have words in common can then be queried using HGDB's indexing and query facilities to ask questions like this: "give me all sentences that contain the word xy". This is still all quite trivial. Check out the following link for more details on what's planned: https://groups.google.com/d/topic/hypergraphdb/_fJ_5IMLjJU/discussion
btw: Word3 is simply a Java Bean having the string field "word" as required by HGDB.