Created
May 15, 2018 07:35
-
-
Save yycking/afb8d3bb4b29bc38431b0ca218296240 to your computer and use it in GitHub Desktop.
自然語言處理 NSLinguisticTagger
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
語言辨識:zh-Hant | |
斷詞:["戚戚", "堆積", "到", "淒慘", "得", "急", "憔悴", "冷冷清清", "傷心", "這", "也", "點點滴滴", "將息", "杯", "淡", "時候", "摘", "慘", "黃", "酒", "獨自", "最", "他", "有", "淒", "怎", "正", "梧桐", "難", "更", "字", "窗兒", "守", "風", "是", "如今", "兩", "乍暖還寒", "盞", "次第", "個", "尋尋覓覓", "堪", "舊時", "過", "愁", "怎生", "了得", "敵", "來", "相識", "花", "誰", "雁", "著", "黑", "卻", "晚", "黃昏", "細雨", "三", "滿地", "兼", "一", "損"] | |
詞性標記 | |
詞形還原 | |
具名實體辨識 | |
語言辨識:en | |
斷詞:["no", "I", "my", "\'ll", "wilt", "Capulet", "thou", "not", "if", "be", "and", "but", "Romeo", "love", "name", "thy", "wherefore", "longer", "O", "art", "a", "And", "father", "Deny", "Or", "sworn", "refuse"] | |
詞性標記 | |
no:Adverb | |
I:Pronoun | |
my:Determiner | |
'll:Verb | |
wilt:Noun | |
Capulet:Noun | |
thou:Determiner | |
not:Adverb | |
if:Preposition | |
be:Verb | |
and:Conjunction | |
but:Conjunction | |
Romeo:Noun | |
love:Noun | |
name:Noun | |
thy:Adjective | |
wherefore:Pronoun | |
longer:Adverb | |
O:Noun | |
art:Noun | |
a:Determiner | |
And:Conjunction | |
father:Noun | |
Deny:Noun | |
Or:Conjunction | |
sworn:Verb | |
refuse:Verb | |
詞形還原 | |
no:no | |
I:I | |
my:I | |
'll:will | |
wilt:wilt | |
thou:thou | |
not:not | |
if:if | |
be:be | |
and:and | |
but:but | |
Romeo:Romeo | |
love:love | |
name:name | |
thy:thee | |
wherefore:wherefore | |
longer:long | |
O:oh | |
art:art | |
a:a | |
And:and | |
father:father | |
Deny:deny | |
Or:or | |
sworn:sworn | |
refuse:refuse | |
具名實體辨識 | |
Romeo:PlaceName |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import Foundation | |
/// Traditional Chinese sample text used to exercise the tagger.
///
/// NOTE(review): the identifier reads "Chang Hen Ge" (長恨歌), but the verses
/// appear to be Li Qingzhao's "Sheng Sheng Man" (聲聲慢). The name is kept
/// because the driver loop in this file refers to it — confirm before renaming.
let 長恨歌 = """
尋尋覓覓,冷冷清清,
淒淒慘慘戚戚。
乍暖還寒時候,最難將息。
三杯兩盞淡酒,
怎敵他、晚來風急。
雁過也,正傷心,卻是舊時相識。
滿地黃花堆積,憔悴損,如今有誰堪摘?
守著窗兒,獨自怎生得黑。
梧桐更兼細雨,到黃昏,點點滴滴。
這次第,怎一個愁字了得。
"""

/// English sample text (four lines from the balcony scene of "Romeo and Juliet").
let Shakespeare = """
O Romeo, Romeo! wherefore art thou Romeo?
Deny thy father and refuse thy name;
Or, if thou wilt not, be but sworn my love,
And I'll no longer be a Capulet
"""
/// Convenience wrappers around `NSLinguisticTagger` for whole-string analysis.
extension String {
    /// Language identification: the language code the tagger reports as
    /// dominant for this string (e.g. "en", "zh-Hant"), or `nil` when the
    /// tagger reports none.
    var 語言辨識: String? {
        let tagger = NSLinguisticTagger(tagSchemes: [.language], options: 0)
        tagger.string = self
        return tagger.dominantLanguage
    }

    /// Enumerates the string and collects every tagged token in document order.
    ///
    /// Whitespace and punctuation tokens are skipped and multi-word names are
    /// joined into a single token. Tokens for which the tagger yields no tag
    /// are dropped.
    private func orderedTaggedTokens(unit: NSLinguisticTaggerUnit,
                                     scheme: NSLinguisticTagScheme) -> [(String, NSLinguisticTag)] {
        // Bridge once up front: token ranges are expressed in UTF-16 units,
        // which is exactly what NSString indexes by.
        let text = self as NSString
        let tagger = NSLinguisticTagger(tagSchemes: [scheme], options: 0)
        tagger.string = self

        var pairs: [(String, NSLinguisticTag)] = []
        tagger.enumerateTags(
            in: NSRange(location: 0, length: text.length),
            unit: unit,
            scheme: scheme,
            options: [.omitWhitespace, .omitPunctuation, .joinNames]
        ) { tag, tokenRange, _ in
            guard let tag = tag else { return }
            pairs.append((text.substring(with: tokenRange), tag))
        }
        return pairs
    }

    /// Tags the string with `scheme` at granularity `unit`.
    ///
    /// - Parameters:
    ///   - unit: The linguistic unit to enumerate (word, sentence, ...).
    ///   - scheme: The tag scheme to apply.
    /// - Returns: A dictionary from token text to its tag. When the same token
    ///   text occurs more than once, the tag of the last occurrence wins.
    func tagger(unit: NSLinguisticTaggerUnit,
                scheme: NSLinguisticTagScheme) -> [String:NSLinguisticTag] {
        return Dictionary(orderedTaggedTokens(unit: unit, scheme: scheme),
                          uniquingKeysWith: { _, latest in latest })
    }

    /// Word segmentation: the distinct word tokens of the string, in order of
    /// first appearance (previously the order was arbitrary dictionary order).
    var 斷詞: [String] {
        var seen = Set<String>()
        return orderedTaggedTokens(unit: .word, scheme: .tokenType)
            .map { $0.0 }
            .filter { seen.insert($0).inserted }
    }

    /// Part-of-speech tagging: token -> lexical class, excluding tokens tagged
    /// `.otherWord`.
    var 詞性標記: [String:NSLinguisticTag] {
        return tagger(unit: .word, scheme: .lexicalClass)
            .filter { $0.value != .otherWord }
    }

    /// Lemmatization: token -> tag whose `rawValue` is the lemma produced by
    /// the tagger.
    var 詞形還原: [String:NSLinguisticTag] {
        return tagger(unit: .word, scheme: .lemma)
    }

    /// Named-entity recognition: token -> name type, excluding tokens tagged
    /// `.otherWord` or `.other`.
    var 具名實體辨識: [String:NSLinguisticTag] {
        return tagger(unit: .word, scheme: .nameType)
            .filter { $0.value != .otherWord && $0.value != .other }
    }
}
// Driver: runs every analysis on both sample texts and prints the results.
for text in [長恨歌, Shakespeare] {
    // The detected language is optional; fall back to "und" (undetermined)
    // instead of force-unwrapping, which would crash when no language is reported.
    print("語言辨識:\(text.語言辨識 ?? "und")")
    print("斷詞:\(text.斷詞)")

    let sections: [(String, [String: NSLinguisticTag])] = [
        ("詞性標記", text.詞性標記),
        ("詞形還原", text.詞形還原),
        ("具名實體辨識", text.具名實體辨識),
    ]
    for (title, tags) in sections {
        print(title)
        // Dictionary iteration order is unspecified; sort by word so the
        // output is the same on every run.
        for (word, tag) in tags.sorted(by: { $0.key < $1.key }) {
            print("\(word):\(tag.rawValue)")
        }
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment