Last active
December 23, 2020 09:03
-
-
Save dalei2019/6996bd4b69a49d9b88a357e184989c1e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Demonstrates word segmentation with a custom Ansj dictionary inside a Spark job.
 *
 * The method:
 *  1. builds (or reuses) a Hive-enabled `SparkSession`;
 *  2. writes a one-word custom dictionary to `dicPath` as CSV, overwriting any previous content;
 *  3. reads the dictionary back, registers every word in Ansj's `DicLibrary` under `dicKey`;
 *  4. broadcasts the resulting dictionary and uses it to segment a sample sentence;
 *  5. prints the segmented sentence (words joined by a single space) to stdout.
 *
 * Side effects: overwrites the data at `/tmp/user.dic` and mutates the JVM-global `DicLibrary`.
 */
def split(): Unit = {
  import org.apache.spark.SparkConf
  import org.ansj.library.DicLibrary
  import org.ansj.splitWord.analysis.DicAnalysis
  import org.apache.spark.sql.{SaveMode, SparkSession}

  val conf = new SparkConf().setAppName("HelloWorld")
  val spark = SparkSession
    .builder()
    .config(conf)
    .enableHiveSupport()
    .getOrCreate()
  // Brings the RDD -> DataFrame conversion (`toDF`) into scope; without it the
  // dictionary write below does not compile outside of spark-shell.
  import spark.implicits._
  // Derive the SparkContext from the session instead of relying on a shell-provided `sc`.
  val sc = spark.sparkContext

  // Path where the custom dictionary is stored (an HDFS path per the original note;
  // it resolves against the cluster's default file system).
  val dicPath = "/tmp/user.dic"
  // Key under which the custom dictionary is registered (any name may be chosen).
  val dicKey = "HelloDic"
  // Sample sentence used to test segmentation.
  val str = "杭州排名第一的网红店"
  val text = sc.parallelize(Seq(str))

  // Create the dictionary file.
  // Alternative that writes an empty dictionary:
  // sc.parallelize(Seq("")).toDF().write.format("csv").mode(SaveMode.Overwrite).save(dicPath)
  sc.parallelize(Seq("网红店")).toDF().write.format("csv").mode(SaveMode.Overwrite).save(dicPath)

  // Load the dictionary. It is collected exactly once, so caching it would bring no benefit.
  val dic = sc.textFile(dicPath)

  // Initialise the dictionary on the driver: the first CSV column of each line is the word.
  // Blank lines and lines without a usable first column are skipped rather than
  // inserting an empty word or failing with an index error.
  dic.collect().foreach { line =>
    line.split(",").headOption.filter(_.nonEmpty).foreach { word =>
      DicLibrary.insertOrCreate(dicKey, word, DicLibrary.DEFAULT_NATURE, DicLibrary.DEFAULT_FREQ)
    }
  }

  // Broadcast the dictionary so the segmentation closure can use it on the executors.
  val userDic = sc.broadcast(DicLibrary.get(dicKey))

  // Segment each line with the custom dictionary and print the first result.
  text.mapPartitions { iter =>
    iter.map(line => DicAnalysis.parse(line, userDic.value).toStringWithOutNature(" "))
  }.take(1).foreach(println)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment