An example showing how to use Tencent COS (Cloud Object Storage) with PySpark.
import os

from pyspark import SparkContext

# First, download hadoop-cos-x.x.x-shaded.jar from https://github.com/tencentyun/hadoop-cos
# and hand it to the pyspark shell via --jars.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--jars ./hadoop-cos-2.8.5-shaded.jar pyspark-shell'

sc = SparkContext(appName="wordCount")

# Basic COS configuration; see https://cloud.tencent.com/document/product/436/6884 for more options.
sc._jsc.hadoopConfiguration().set("fs.cosn.userinfo.secretId", "ak")    # your SecretId
sc._jsc.hadoopConfiguration().set("fs.cosn.userinfo.secretKey", "sk")   # your SecretKey
sc._jsc.hadoopConfiguration().set("fs.cosn.bucket.region", "ap-guangzhou")  # bucket region
sc._jsc.hadoopConfiguration().set("fs.cosn.impl", "org.apache.hadoop.fs.CosFileSystem")  # register the cosn:// scheme

# Create an RDD from an object stored in COS.
text_file = sc.textFile("cosn://ap_name/filename")

# Word count.
counts = text_file.flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)

# Save the result.
counts.saveAsTextFile("anywhere")

if __name__ == '__main__':
    pass
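The output path in the final step does not have to be local: once the cosn:// scheme is registered, results can be written straight back to COS. A minimal sketch, using a hypothetical bucket name and key prefix:

# Hypothetical output location (replace with your own bucket and key prefix).
counts.saveAsTextFile("cosn://your-bucket-name/wordcount-output")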
The Hadoop COS documentation at https://github.com/apache/hadoop/blob/trunk/hadoop-cloud-storage-project/hadoop-cos/src/site/markdown/cloud-storage/index.md might also help.
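Following the configuration keys described there, here is a rough sketch (not from the original gist) of passing the same settings through SparkConf with Spark's spark.hadoop.* prefix instead of mutating the Hadoop configuration on a live context; the jar path, credentials, region, and bucket name are placeholders:

# Sketch: same COS setup via SparkConf (spark.hadoop.* keys are forwarded to the Hadoop configuration).
from pyspark import SparkConf, SparkContext

conf = (SparkConf()
        .setAppName("wordCount")
        .set("spark.jars", "./hadoop-cos-2.8.5-shaded.jar")            # assumed local jar path
        .set("spark.hadoop.fs.cosn.userinfo.secretId", "ak")           # your SecretId
        .set("spark.hadoop.fs.cosn.userinfo.secretKey", "sk")          # your SecretKey
        .set("spark.hadoop.fs.cosn.bucket.region", "ap-guangzhou")     # bucket region
        .set("spark.hadoop.fs.cosn.impl", "org.apache.hadoop.fs.CosFileSystem"))
sc = SparkContext(conf=conf)
text_file = sc.textFile("cosn://ap_name/filename")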