Skip to content

Instantly share code, notes, and snippets.

@rolph-recto
Last active August 29, 2015 13:57
Show Gist options
  • Save rolph-recto/9593225 to your computer and use it in GitHub Desktop.
document-system.ipynb
{
"metadata": {
"name": "Document System"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": "# Document System\n\nThis notebook wires together the document-system pipeline:\n\n1. fetch documents from a SQLite file and build a `DocumentCorpus`;\n2. construct a `DocumentSystem` from the corpus, three featuresets and three marginalia emitters, then build the featuresets;\n3. pass the system and a SQLAlchemy engine to `SQLMarginaliaSerializer` and serialize the corpus marginalia.\n\nRun the cells top to bottom on a fresh kernel. Every tunable value lives in the configuration cell."
},
{
"cell_type": "code",
"collapsed": false,
"input": "# All imports live in this one cell so a fresh kernel only needs a top-to-bottom run.\nfrom sqlalchemy import create_engine\n\nfrom document_system import DocumentSystem\nfrom document_fetcher import SqliteDocumentFetcher\nfrom document_corpus import DocumentCorpus\nfrom featureset import TfIdfFeatureset, SimilarityFeatureset, VACodeReferenceFeatureset\nfrom marginalia_emitter import SimilarityEmitter, KeywordEmitter, ReferenceEmitter\nfrom serialize_marginalia import SQLMarginaliaSerializer",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Configuration: every value a reader might tune, defined exactly once.\nDB_PATH = \"vacode2.db\"        # SQLite file given to the fetcher and to the SQLAlchemy engine\nDOCUMENT_COUNT = 1000         # passed to the fetcher as `count`\nTITLE_FIELD = \"section\"       # passed to the fetcher as `title`\nSIMILARITY_THRESHOLD = 0.25   # passed to SimilarityEmitter as `threshold`\nKEYWORD_THRESHOLD = 0.30      # passed to KeywordEmitter as `threshold`\nSQL_ECHO = True               # passed to create_engine as `echo` (SQLAlchemy statement logging)",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": "## Build the corpus"
},
{
"cell_type": "code",
"collapsed": false,
"input": "fetcher = SqliteDocumentFetcher(DB_PATH, count=DOCUMENT_COUNT, title=TITLE_FIELD)\ncorpus = DocumentCorpus(fetcher)\ncorpus.build()",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": "## Featuresets, emitters and the document system"
},
{
"cell_type": "code",
"collapsed": false,
"input": "# Featuresets: the similarity featureset is constructed from the TF-IDF one, so TF-IDF is created first.\ntfidf_fs = TfIdfFeatureset()\nsimilarity_fs = SimilarityFeatureset(tfidf_fs)\nreference_fs = VACodeReferenceFeatureset()\nfeaturesets = [tfidf_fs, similarity_fs, reference_fs]\n\n# Marginalia emitters; thresholds come from the configuration cell.\nsimilarity_emitter = SimilarityEmitter(threshold=SIMILARITY_THRESHOLD)\nkeywords_emitter = KeywordEmitter(threshold=KEYWORD_THRESHOLD)\nreference_emitter = ReferenceEmitter()\nemitters = [similarity_emitter, keywords_emitter, reference_emitter]\n\n# Named doc_system rather than sys so the standard-library module name is not shadowed.\ndoc_system = DocumentSystem(corpus, featuresets, emitters)",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": true,
"input": "doc_system.build_featuresets()",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": "## Serialize marginalia"
},
{
"cell_type": "code",
"collapsed": false,
"input": "# The engine URL is derived from DB_PATH so the fetcher and the serializer are configured from one setting.\nengine = create_engine(\"sqlite:///\" + DB_PATH, echo=SQL_ECHO)\nserializer = SQLMarginaliaSerializer(doc_system, engine)",
"language": "python",
"metadata": {},
"outputs": []
},
{
"cell_type": "code",
"collapsed": false,
"input": "serializer.serialize_corpus_marginalia()",
"language": "python",
"metadata": {},
"outputs": []
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment