Last active
December 22, 2015 19:39
-
-
Save renaud/6521003 to your computer and use it in GitHub Desktop.
Mallet MaxEnt classifier for paper references
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package ch.epfl.bbp.uima.projects.references; | |
| import static cc.mallet.pipe.iterator.FileIterator.LAST_DIRECTORY; | |
| import static com.google.common.collect.Lists.newArrayList; | |
| import static java.util.regex.Pattern.compile; | |
| import static org.apache.commons.lang.StringUtils.join; | |
| import static org.slf4j.LoggerFactory.getLogger; | |
| import java.io.File; | |
| import java.io.FileFilter; | |
| import java.io.FileInputStream; | |
| import java.io.InputStream; | |
| import java.util.Iterator; | |
| import java.util.List; | |
| import java.util.logging.LogManager; | |
| import org.slf4j.Logger; | |
| import cc.mallet.classify.Classifier; | |
| import cc.mallet.classify.ClassifierTrainer; | |
| import cc.mallet.classify.MaxEntTrainer; | |
| import cc.mallet.classify.Trial; | |
| import cc.mallet.pipe.CharSequence2TokenSequence; | |
| import cc.mallet.pipe.FeatureSequence2FeatureVector; | |
| import cc.mallet.pipe.Input2CharSequence; | |
| import cc.mallet.pipe.Pipe; | |
| import cc.mallet.pipe.PrintInputAndTarget; | |
| import cc.mallet.pipe.SerialPipes; | |
| import cc.mallet.pipe.Target2Label; | |
| import cc.mallet.pipe.TokenSequence2FeatureSequence; | |
| import cc.mallet.pipe.iterator.FileIterator; | |
| import cc.mallet.pipe.tsf.RegexMatches; | |
| import cc.mallet.share.upenn.ner.LongRegexMatches; | |
| import cc.mallet.types.Instance; | |
| import cc.mallet.types.InstanceList; | |
| import cc.mallet.types.Label; | |
| import cc.mallet.types.Labeling; | |
| import cc.mallet.util.CharSequenceLexer; | |
| import cc.mallet.util.Randoms; | |
| public class ReferencesClassifier2 { | |
| private static Logger LOG = getLogger(ReferencesClassifier2.class); | |
| private final static File root = new File("referenceClassifier/corpus/"); | |
| private final static int trials = 30; | |
| public static void main(String[] args) { | |
| // pipe instances | |
| InstanceList instanceList = new InstanceList( | |
| new SerialPipes(getPipes())); | |
| FileIterator iterator = new FileIterator(new File[] { root }, | |
| new TxtFilter(), LAST_DIRECTORY); | |
| instanceList.addThruPipe(iterator); | |
| // cross-validate | |
| System.out.println("trial\tprec\trecall\tF-score"); | |
| double f1s = 0; | |
| for (int i = 0; i < trials; i++) { | |
| Trial trial = testTrainSplit(instanceList); | |
| System.out.println(join(new Object[] {// | |
| i, trial.getPrecision(TESTING), trial.getRecall(TESTING), | |
| trial.getF1(TESTING) }, "\t")); | |
| f1s += trial.getF1(TESTING); | |
| } | |
| System.out.println("mean F1 = " + (f1s / (trials + 0d))); | |
| } | |
| static List<Pipe> getPipes() { | |
| List<Pipe> pipes = newArrayList(); | |
| pipes.add(new Target2Label()); | |
| pipes.add(new Input2CharSequence()); | |
| pipes.add(new CharSequence2TokenSequence( | |
| CharSequenceLexer.LEX_NONWHITESPACE_TOGETHER)); | |
| addMyPipes(pipes); | |
| // pipes.add(new DummyPipe()); | |
| pipes.add(new PrintInputAndTarget()); | |
| pipes.add(new TokenSequence2FeatureSequence()); | |
| pipes.add(new FeatureSequence2FeatureVector()); | |
| return pipes; | |
| } | |
| private static void addMyPipes(List<Pipe> pipes) { | |
| // YEARS | |
| pipes.add(new RegexMatches("years",// | |
| compile(".*(19[56789]\\d|20[01]\\d).*"))); | |
| pipes.add(new RegexMatches("years_abcd",// | |
| compile(".*(19[56789]\\d[abcd]|20[01]\\d[abcd]).*"))); | |
| pipes.add(new RegexMatches("years_parenthesis",// | |
| compile(".*(\\(19[56789]\\d|20[01]\\d\\)).*"))); | |
| // 385-420 | |
| pipes.add(new LongRegexMatches("volume",// | |
| compile(".*(\\d+ ?[–-] ?\\d+).*"), 1, 5)); | |
| // Comp. Neurol. 167: 385-420 | |
| pipes.add(new LongRegexMatches("volume_more",// | |
| compile(".*(\\d+: ?\\d{1,4} ?[–-] ?\\d{1,4}).*"), 1, 10)); | |
| // pages | |
| pipes.add(new LongRegexMatches("pages",// | |
| compile(".*(p.? \\d+ [–-] \\\\d+).*"), 1, 5)); | |
| // Gurdjian, E. S. | |
| pipes.add(new LongRegexMatches("author1",// | |
| compile(".*([A-Z]\\w+, [A-Z]\\.).*"), 1, 10)); | |
| // Beckstead RM (1979) | |
| pipes.add(new LongRegexMatches("author2",// | |
| compile(".*([A-Z]\\w+ [A-Z][A-Z ,]).*"), 1, 10)); | |
| // Newman, R., and S. S. Winans | |
| pipes.add(new LongRegexMatches("author3",// | |
| compile(".*(, and [A-Z]\\. [A-Z]).*"), 1, 10)); | |
| // repetitions: Boussaoud D, Ungerleider LC, Desimone R | |
| pipes.add(new LongRegexMatches("author4",// | |
| compile(".*((, [A-Z]\\w+ [A-Z]{1,2}){2,}).*"), 1, 10)); | |
| // , {comma, name} | |
| pipes.add(new LongRegexMatches("author5",// | |
| compile(".*(, [A-Z]\\w+ [A-Z]).*"), 1, 10)); | |
| // 4 Brodmann, K., V | |
| // 17. Sorensen OW, | |
| pipes.add(new LongRegexMatches("author6",// | |
| compile(".*(\\d{1,2}\\.? [A-Z]\\w+,? [A-Z]).*"), 1, 10)); | |
| // NEGATIVE EXAMPLES | |
| // at least 3 uppercase letters | |
| pipes.add(new RegexMatches("neg_3uppercase", compile(".*([A-Z]{3,}).*"))); | |
| // (Beckstead RM <-- parenthesis! | |
| pipes.add(new LongRegexMatches("neg_author_parenthesis",// | |
| compile(".*(\\([A-Z]\\w+ [A-Z][A-Z ,]).*"), 1, 10)); | |
| // Gurdjian, E <-- parenthesis! | |
| pipes.add(new LongRegexMatches("neg_author_parenthesis2",// | |
| compile(".*(\\([A-Z]\\w+, [A-Z]).*"), 1, 10)); | |
| // ng (Rosenmund et al., 1998; Smith and Howe, 2000), a | |
| pipes.add(new LongRegexMatches("neg_inline_ref",// | |
| compile(".*(\\([A-Z]\\w+.{3,40}\\d+\\)).*"), 1, 10)); | |
| } | |
| static final int TRAINING = 0, TESTING = 1; | |
| public static Trial testTrainSplit(InstanceList instances) { | |
| InstanceList[] instanceLists = instances.split(new Randoms(), | |
| new double[] { 0.9, 0.1, 0.0 }); | |
| // LOG.debug("{} training instance, {} testing instances", | |
| // instanceLists[0].size(), instanceLists[1].size()); | |
| @SuppressWarnings("rawtypes") | |
| ClassifierTrainer trainer = new MaxEntTrainer(); | |
| Classifier classifier = trainer.train(instanceLists[TRAINING]); | |
| return new Trial(classifier, instanceLists[TESTING]); | |
| } | |
| static class TxtFilter implements FileFilter { | |
| public boolean accept(File file) { | |
| return file.toString().endsWith(".txt"); | |
| } | |
| } | |
| } |
We can make this file beautiful and searchable if this error is corrected: It looks like row 2 should actually have 3 columns, instead of 4 in line 1.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| trial prec recall F-score | |
| 0 0.9090909090909091 0.9580838323353293 0.932944606413994 | |
| 1 0.9467455621301775 0.963855421686747 0.9552238805970149 | |
| 2 0.9302325581395349 0.9696969696969697 0.9495548961424333 | |
| 3 0.8944444444444445 0.9640718562874252 0.9279538904899136 | |
| 4 0.9117647058823529 0.950920245398773 0.9309309309309309 | |
| 5 0.9265536723163842 0.9704142011834319 0.9479768786127167 | |
| 6 0.9310344827586207 0.9364161849710982 0.9337175792507204 | |
| 7 0.8633879781420765 0.9634146341463414 0.9106628242074928 | |
| 8 0.9425287356321839 0.9879518072289156 0.9647058823529412 | |
| 9 0.8781725888324873 0.9885714285714285 0.9301075268817204 | |
| 10 0.95625 0.95625 0.95625 | |
| 11 0.9479768786127167 0.9820359281437125 0.9647058823529412 | |
| 12 0.9207317073170732 0.9741935483870968 0.9467084639498433 | |
| 13 0.9056603773584906 0.935064935064935 0.9201277955271565 | |
| 14 0.8978494623655914 0.9766081871345029 0.9355742296918768 | |
| 15 0.9213483146067416 0.9534883720930233 0.9371428571428573 | |
| 16 0.9473684210526315 0.9642857142857143 0.9557522123893805 | |
| 17 0.9447513812154696 0.9771428571428571 0.9606741573033707 | |
| 18 0.8982035928143712 0.9493670886075949 0.923076923076923 | |
| 19 0.8715083798882681 0.975 0.9203539823008849 | |
| 20 0.9239766081871345 0.9753086419753086 0.9489489489489489 | |
| 21 0.9142857142857143 0.975609756097561 0.9439528023598821 | |
| 22 0.907608695652174 0.9653179190751445 0.9355742296918768 | |
| 23 0.9186046511627907 0.9518072289156626 0.9349112426035502 | |
| 24 0.9502762430939227 0.9717514124293786 0.9608938547486033 | |
| 25 0.9270833333333334 0.9888888888888889 0.9569892473118279 | |
| 26 0.8705882352941177 0.961038961038961 0.9135802469135802 | |
| 27 0.927710843373494 0.9746835443037974 0.9506172839506173 | |
| 28 0.9542857142857143 0.9766081871345029 0.9653179190751444 | |
| 29 0.8994708994708994 0.9497206703910615 0.9239130434782608 | |
| mean F1 = 0.9412948072899134 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Kitagawa, T., Tanaka, M., and Akamatsu, Y. (1989) Biochim. | |
| Biophys. Acta 980, 100-108 | |
| Gerhart, D. Z., LeVasseur, R. J., Broderuis, M. A., and Drewes, | |
| L. R. (1989) J. Neurosci. Res. 22, 464-472 34. | |
| 19. Froehner, S. C., Davies, A., Baldwin, S. A., and Lienhard, G. E. | |
| (1988) J. Neurocytol. 1'7, 173-178 35. | |
| 20. Flier, J. S., Mueckler, M., McCall, A. L., and Lodish, H. F. (1987) | |
| J. Clin. Inuest. 79, 657-661 36. | |
| 21. Boado, R. J., and Pardridge, W. M. (1990) Biochem. Biophys. | |
| Res. Commun. 166,174-179 37. | |
| 22. Pardridae. W. M., Yang, J., Eisenherg, J., and Mietus. L. J. (1986) | |
| J. Cereb. Blood.Flow%etab. 6, 203-211 38. | |
| 23. Pardridee. W. M.. Yane. J.. and Eisenbere. -, J. (1985) J. Neuro-&em.-4'5,1141-1147-' 39. | |
| 24. Pardridge, W. M., Eisenberg, J., and Yang, J. (1985) J. Neurothem. 44,1771-1778 40. | |
| 25. Lowry, 0. H., Rosebrough, N. J., Farr, A. L., and Randall, R. J. | |
| (1951) J. Biol. Chem. 193, 265-275 41. | |
| 26. Haspel, H. C., Rosenfeld, M. G., and Rosen, 0. M. (1988) J. Biol. 42. | |
| Cheh.263; 398-403 | |
| 27. Pardridee. W. M.. Trieuero. D.. andFarrell. C. R. (1990) Diabetes. 43. | |
| 39,1040-1044' - ' 44. | |
| 28. Baldwin, S. A., and Lienhard, G. E. (1989) Methods Enzymol. | |
| 174,39-50 | |
| 29. Sogin, D. C., and Hinkle, P. C. (1980) Proc. N&l. Acud. Sci. U. 45. | |
| S. A. 77, 5725-5729 | |
| 30. Sivitz, W., DeSautel, S., Walker, P. S., and Pessin, J. E. (1989) 46. | |
| Endocrinology 124, 1875-1880 | |
| 31. Matthaei, S., Horuk, R., and Olefsky, J. M. (1986) Diabetes 35, | |
| 1181-1184 47. | |
| 32. Gorga, A. R., and Lienhard, G. E. (1981) Biochemistry 20,5108-5113 48. | |
| Angerer, L. M., Stoler, M. H., and Angerer, R. C. (1987) in In |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| were found in CCK basket and Schaffer-collateral associated cells | |
| (Cea-del Rio et al., 2010, 2011), as well as in SOM expressing | |
| O-LM interneurons (Lawrence et al., 2006a). This latter study | |
| showed the emergence of the ADP was dependent on inhibition of the M-current and a slow calcium-activated potassium | |
| channel, as well as activation of calcium-dependent non-selective | |
| cationic current (ICAN). In contrast, the membrane potential | |
| depolarization and increased firing frequency in CA1 PV basket | |
| cells require activation of M1 mAChRs (Cea-del Rio et al., 2010, | |
| 2011). | |
| Immunocytochemical studies have shown differential expression of muscarinic receptor subtypes within the whole brain | |
| (Levey et al., 1991) and hippocampus (Levey et al., 1995; Hájos | |
| et al., 1998). Most notably, in the hippocampus M2 mAChRs | |
| are densely expressed on axon terminals of PV basket and axoaxonic cells, while in dendritic targeting interneurons (calretinin | |
| and SOM) M2 mAChRs are expressed on the soma and dendrites | |
| (Hájos et al., 1998). Similarly, M2 mAChRs are expressed on PV | |
| basket cells of the auditory cortex (Salgado et al., 2007) and EC | |
| (Chaudhuri et al., 2005). Activation of the M2 mAChRs has been | |
| shown to decrease the amplitude of unitary IPSPs in CA3 pyramidal cells (Szabó et al., 2010), IPSCs in pyramidal cells of the | |
| auditory cortex (Salgado et al., 2007) and IPSPs in pyramidal and | |
| SCs of the EC (Apergis-Schoute et al., 2007). These data strongly | |
| support a heterosynaptic regulatory role for M2 mAChRs, where | |
| their activation on interneurons inhibits the synaptic release of | |
| GABA and decreases inhibitory potentials in pyramidal cells. | |
| In contrast, M1 mAChRs and M3 mAChRs activation have a | |
| predominantly excitatory effect on interneurons, increasing levels of inhibition in principal cells. Interesting questions remain | |
| unanswered within the EC. For instance, in what interneurons | |
| are muscarinic receptors other than M2 expressed, and are they | |
| localized in particular cellular compartments? Can muscarinic | |
| receptor expression be correlated with interneuron innervation of | |
| particular principal cell domains, and do the same neurochemical markers (CCK, PV, SOM, neuropeptide Y (NPY), calbindin, | |
| calretinin, etc.) correlate with similar innervation of principal cell | |
| domains as seen in the hippocampus (reviewed by Freund and | |
| Buzsáki, 1996)? | |
| Van der Zee and colleagues have contributed extensively to | |
| mAChR immunocytochemistry of the hippocampus, neocortex, | |
| and amygdala (reviewed in Van der Zee et al., 1999). In the | |
| interest of space, here we will focus on their data concerning | |
| the hippocampus. Their body of work uses M35, a pan-mAChR | |
| antibody which labels all muscarinic receptors that are in an | |
| activated state (André et al., 1984). Therefore, M35 immunoreactivity (ir) allows visualization of phosphorylated/internalized | |
| mAChRs and can be used as a tool to investigate the functional | |
| cholinergic properties of a cell or network. Immunocytochemical | |
| studies in naïve animals have found M35-ir in basket cells within | |
| stratum pyramidale of CA1-CA3 as well as other interneuron | |
| types in SLM and stratum oriens/alveus (Van der Zee et al., | |
| 1989, 1991a). Similar studies found a high degree of colocalization between M35 and the GABA markers PV and SOM | |
| (Van der Zee et al., 1991a,b, 1993). Out of 2730 hippocampal | |
| interneurons expressing mAChRs, 33% colocalized with SOM, | |
| 52% with PV, and 72.8% with SOM and/or PV. Furthermore, |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment