-
-
Save vgiri2015/82e706e9bbfba35c55bf3c8f5d40a0af to your computer and use it in GitHub Desktop.
# Create a method to handle the non-ASCII to ASCII conversion
def nonasciitoascii(unicodestring):
    """Return *unicodestring* with every non-ASCII character removed.

    The text is encoded to ASCII with the ``"ignore"`` error handler, which
    silently drops any character that has no ASCII representation, and is
    then decoded back so the caller receives a ``str``.

    Args:
        unicodestring: The text to clean, or ``None``.

    Returns:
        The ASCII-only text as a ``str``, or ``None`` when the input is
        ``None``.
    """
    # Pass missing values through untouched instead of raising
    # AttributeError on ``None.encode``.
    if unicodestring is None:
        return None
    # ``encode`` alone yields ``bytes`` on Python 3; returning that from a
    # UDF shows up as ``[B@...`` in the DataFrame, so decode back to text.
    return unicodestring.encode("ascii", "ignore").decode("ascii")
# Create a sample DataFrame
from pyspark.sql.window import Window
from pyspark.sql.functions import count, col, udf  # `udf` is required below
from pyspark.sql import Row

# One regular row plus several rows whose values contain non-ASCII
# characters, so the effect of the conversion is visible side by side.
d = [
    Row(coltype='regular', value="Happy Coding"),
    Row(coltype='non ascii', value="hello aåbäcö"),
    Row(coltype='non ascii', value="6Â 918Â 417Â 712"),
    Row(coltype='non ascii', value="SAN MATEO� �?A "),
    Row(coltype='non ascii', value="SAINT-LOUIS (CANADA� � AL)"),
]

# NOTE(review): `sqlContext` is not defined in this snippet; presumably it is
# the context object pre-created by the pyspark shell/notebook - confirm.
data = sqlContext.createDataFrame(d)

# Apply this conversion on the DataFrame: wrap the plain Python function as a
# UDF and add its result as a new `converted` column next to the original.
convertedudf = udf(nonasciitoascii)
converted = data.select('coltype', 'value').withColumn('converted', convertedudf(data.value))
converted.show()
@duginivijay The `udf` function should be imported like this: `from pyspark.sql.functions import udf`.
Hi, I get this as the output when I run your code. How do I get the correct value for `converted`? Can you please help?
+---------+--------------------+-----------+
| coltype| value| converted|
+---------+--------------------+-----------+
| regular| Happy Coding| [B@2d5d8dd4|
|non ascii| hello aåbäcö| [B@1f1744a2|
|non ascii| 6Â 918Â 417Â 712| [B@7fbc580a|
|non ascii| SAN MATEO� �?A |[B@61ef3346|
|non ascii|SAINT-LOUIS (CANA...| [B@534ecdd5|
+---------+--------------------+-----------+
@Arpit9873 I know this is a very old thread, but for anyone else who might come across this one day - You just have to cast the return object as a string like this:
def nonasciitoascii(unicodestring):
    """Return *unicodestring* with each non-ASCII character replaced by ``?``.

    The text is encoded to ASCII with the ``"replace"`` error handler, which
    substitutes ``?`` for every character that has no ASCII representation,
    and is then decoded back so the caller receives a ``str``.

    Args:
        unicodestring: The text to clean, or ``None``.

    Returns:
        The ASCII-only text as a ``str``, or ``None`` when the input is
        ``None``.
    """
    # Pass missing values through untouched instead of raising
    # AttributeError on ``None.encode``.
    if unicodestring is None:
        return None
    # Decode rather than wrap in ``str()``: on Python 3 ``str(b"abc")`` is
    # the repr ``"b'abc'"``, not the text ``"abc"``.
    return unicodestring.encode("ascii", "replace").decode("ascii")
I was trying to find where you defined `udf`. Is it predefined, or did you load a package for it?