- 实词:名词、动词、形容词、状态词、区别词、数词、量词、代词
- 虚词:副词、介词、连词、助词、拟声词、叹词。
n 名词
nr 人名
# Initialize the scroll | |
page = es.search( | |
index = 'yourIndex', | |
doc_type = 'yourType', | |
scroll = '2m', | |
search_type = 'scan', | |
size = 1000, | |
body = { | |
# Your query's body | |
}) |
<?xml version="1.0" encoding="UTF-8" standalone="no"?> | |
<?pde version="3.8"?><target name="simple" sequenceNumber="12"> | |
<locations> | |
<location path="${env_var:ECLIPSE_432_HOME}" type="Profile"/> | |
<location path="${project_loc:builder_external}/builder/lib" type="Directory"/> | |
<location includeAllPlatforms="false" includeConfigurePhase="true" includeMode="planner" includeSource="true" type="InstallableUnit"> | |
<unit id="org.apache.commons.beanutils" version="1.8.0.v201205091237"/> | |
<unit id="org.apache.commons.collections" version="3.2.0.v2013030210310"/> | |
<unit id="com.google.guava" version="12.0.0.v201212092141"/> | |
<unit id="com.google.gson" version="2.1.0.v201303041604"/> |
"""A simple implementation of a greedy transition-based parser. Released under BSD license.""" | |
from os import path | |
import os | |
import sys | |
from collections import defaultdict | |
import random | |
import time | |
import pickle | |
SHIFT = 0; RIGHT = 1; LEFT = 2; |
mahout clusterdump \ | |
-dt sequencefile \ # format: {Integer => String} | |
-d reuters-vectors/dictionary.file-* \ # dictionary: {id => word} | |
-i reuters-kmeans-clusters/clusters-3-final \ # input | |
-o clusters.txt \ # output (local filesystem) | |
-b 10 \ # format length | |
-n 10 # number of top terms to print | |
--distanceMeasure org.apache.mahout.common.distance.CosineDistanceMeasure # default is euclidean distance |
# A Collection of NLP notes
## N-grams
### Calculating unigram probabilities:
P(wi) = count(wi) / count(total number of words)
In English...
git fetch upstream | |
git reset --hard upstream/master |