I want to know how to map values in a specific column in a dataframe.
I have a dataframe which looks like:
df = sc.parallelize([('india','japan'),('usa','uruguay')]).toDF(['col1','col2'])
+-----+-------+
| col1|   col2|
+-----+-------+
|india|  japan|
|  usa|uruguay|
+-----+-------+
I have a dictionary from which I want to map the values.
dicts = sc.parallelize([('india','ind'), ('usa','us'),('japan','jpn'),('uruguay','urg')])
The output I want is:
+-----+-------+--------+--------+
| col1|   col2|col1_map|col2_map|
+-----+-------+--------+--------+
|india|  japan|     ind|     jpn|
|  usa|uruguay|      us|     urg|
+-----+-------+--------+--------+
I have tried using the lookup function, but it doesn't work: it fails with the SPARK-5063 error (an RDD cannot be referenced from inside a transformation or UDF). Following is my approach, which failed:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def map_val(x):
    return dicts.lookup(x)[0]

myfun = udf(lambda x: map_val(x), StringType())
df = df.withColumn('col1_map', myfun('col1'))  # doesn't work
df = df.withColumn('col2_map', myfun('col2'))  # doesn't work
I think the easier way is just to use a plain Python dictionary with create_map and df.withColumn:
from itertools import chain
from pyspark.sql.functions import create_map, lit
simple_dict = {'india':'ind', 'usa':'us', 'japan':'jpn', 'uruguay':'urg'}
mapping_expr = create_map([lit(x) for x in chain(*simple_dict.items())])
df = df.withColumn('col1_map', mapping_expr[df['col1']])\
    .withColumn('col2_map', mapping_expr[df['col2']])
df.show(truncate=False)
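Note that with this mapping expression, any value missing from simple_dict maps to null. If you would rather fall back to the original value, a minimal sketch (reusing mapping_expr from above) is:

from pyspark.sql.functions import coalesce

# Keep the original value when the key is not present in the map
df = df.withColumn('col1_map', coalesce(mapping_expr[df['col1']], df['col1']))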
udf way
I would suggest you convert the list of tuples to a dict and broadcast it so it can be used inside a udf:
dicts = sc.broadcast(dict([('india','ind'), ('usa','us'),('japan','jpn'),('uruguay','urg')]))
from pyspark.sql import functions as f
from pyspark.sql import types as t
def newCols(x):
    return dicts.value[x]
callnewColsUdf = f.udf(newCols, t.StringType())
df.withColumn('col1_map', callnewColsUdf(f.col('col1')))\
    .withColumn('col2_map', callnewColsUdf(f.col('col2')))\
    .show(truncate=False)
which should give you
+-----+-------+--------+--------+
|col1 |col2   |col1_map|col2_map|
+-----+-------+--------+--------+
|india|japan  |ind     |jpn     |
|usa  |uruguay|us      |urg     |
+-----+-------+--------+--------+
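One caveat: newCols raises a KeyError for any value that has no entry in the broadcast dict. A minimal variant (same broadcast dicts as above) that returns null instead:

def newCols(x):
    # .get returns None (shown as null in Spark) for values missing from the dict
    return dicts.value.get(x)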
join way (slower than udf way)
All you have to do is convert the dicts RDD to a DataFrame as well and use two joins with aliasing, as follows:
df = sc.parallelize([('india','japan'),('usa','uruguay')]).toDF(['col1','col2'])
dicts = sc.parallelize([('india','ind'), ('usa','us'),('japan','jpn'),('uruguay','urg')]).toDF(['key', 'value'])
from pyspark.sql import functions as f
df.join(dicts, df['col1'] == dicts['key'], 'inner')\
    .select(f.col('col1'), f.col('col2'), f.col('value').alias('col1_map'))\
    .join(dicts, df['col2'] == dicts['key'], 'inner')\
    .select(f.col('col1'), f.col('col2'), f.col('col1_map'), f.col('value').alias('col2_map'))\
    .show(truncate=False)
which should give you the same result
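Note that the inner joins drop any row whose value has no entry in dicts; a hedged sketch of the first join using a left join instead, which keeps such rows with a null mapping:

# 'left' keeps rows even when the value has no match in dicts (col1_map becomes null)
df.join(dicts, df['col1'] == dicts['key'], 'left')\
    .select(f.col('col1'), f.col('col2'), f.col('value').alias('col1_map'))\
    .show(truncate=False)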
Similar to Ali AzG's answer, but pulling it all out into a handy little method, if anyone finds it useful:
from itertools import chain
from pyspark.sql import DataFrame
from pyspark.sql import functions as F
from typing import Dict
def map_column_values(df: DataFrame, map_dict: Dict, column: str, new_column: str = "") -> DataFrame:
    """Handy method for mapping column values from one value to another

    Args:
        df (DataFrame): DataFrame to operate on
        map_dict (Dict): Dictionary containing the values to map from and to
        column (str): The column containing the values to be mapped
        new_column (str, optional): The name of the column to store the mapped values in.
            If not specified, the values will be stored in the original column.

    Returns:
        DataFrame
    """
    spark_map = F.create_map([F.lit(x) for x in chain(*map_dict.items())])
    return df.withColumn(new_column or column, spark_map[df[column]])
This can be used as follows
from pyspark.sql import Row, SparkSession
spark = SparkSession.builder.master("local[3]").getOrCreate()
df = spark.createDataFrame([Row(A=0), Row(A=1)])
df = map_column_values(df, map_dict={0:"foo", 1:"bar"}, column="A", new_column="B")
df.show()
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
#+---+---+
#|  A|  B|
#+---+---+
#|  0|foo|
#|  1|bar|
#+---+---+
I have a pyspark dataframe
import pandas as pd

foo = spark.createDataFrame(pd.DataFrame({'col': [['a_b', 'bad'], ['a_a', 'good'], []]}))
I would like to filter out all the rows for which 'bad' is in the list of col
I have tried to first create a binary column and then filter on this one:
from pyspark.sql import functions as f
foo = foo.withColumn('at_least_one_bad', f.when(f.col("col").array_contains("bad"),f.lit(1)).otherwise(f.lit(0)))
but I get an error
TypeError: 'Column' object is not callable
Any ideas?
Your syntax is slightly off - try this code below:
import pyspark.sql.functions as f
foo2 = foo.withColumn('at_least_one_bad', f.array_contains('col', 'bad').cast('int'))
foo2.show()
+-----------+----------------+
|        col|at_least_one_bad|
+-----------+----------------+
| [a_b, bad]|               1|
|[a_a, good]|               0|
|         []|               0|
+-----------+----------------+
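If you only need the filtering itself rather than the indicator column, a short sketch using the same function is:

# Keep only the rows whose array does not contain 'bad'
foo_filtered = foo.filter(~f.array_contains('col', 'bad'))
foo_filtered.show()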
I am using a custom function as part of a reduce operation. For the following example I get the message TypeError: reduce() takes no keyword arguments. I believe this is due to the way I am using the dictionary mapping in the function exposed_column. Could you please help me fix this function?
from pyspark.sql import DataFrame, Row
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from functools import reduce
def process_data(df: DataFrame):
    col_mapping = dict(zip(["name", "age"], ["a", "b"]))
    # Do other things...

    def exposed_column(df: DataFrame, mapping: dict):
        return df.select([col(c).alias(mapping.get(c, c)) for c in df.columns])

    return reduce(exposed_column, sequence=col_mapping, initial=df)
spark = SparkSession.builder.appName("app").getOrCreate()
l = [
("Bob", 25, "Spain"),
("Marc", 22, "France"),
("Steve", 20, "Belgium"),
("Donald", 26, "USA"),
]
rdd = spark.sparkContext.parallelize(l)
people = rdd.map(lambda x: Row(name=x[0], age=int(x[1]), country=x[2])).toDF()
people.show()
process_data(people).show()
people.show() looks like this:
+---+-------+------+
|age|country|  name|
+---+-------+------+
| 25|  Spain|   Bob|
| 22| France|  Marc|
| 20|Belgium| Steve|
| 26|    USA|Donald|
+---+-------+------+
And this is the expected output
+------+---+
|     a|  b|
+------+---+
|   Bob| 25|
|  Marc| 22|
| Steve| 20|
|Donald| 26|
+------+---+
reduce does not take keywords, that’s true.
Once you remove the keywords, you’ll notice a more serious issue though: when you iterate over a dictionary, you’re iterating over its keys only. So the function in which you're trying to batch rename the columns won’t do what you had in mind.
One way to do a batch column rename, would be to iterate over the dictionary’s items:
from functools import reduce
from typing import Mapping

from pyspark.sql import DataFrame

def rename_columns(frame: DataFrame, mapping: Mapping[str, str]) -> DataFrame:
    return reduce(lambda f, old_new: f.withColumnRenamed(old_new[0], old_new[1]),
                  mapping.items(), frame)
This allows you to pass in a dictionary (note that the recommendation for adding type hints to arguments is to use Mapping, not dict) that maps column names to other names. Fortunately, withColumnRenamed won’t complain if you try to rename a column that isn’t in the DataFrame, so this is equivalent to your mapping.get(c, c).
One thing I don't see in your code is anything that drops the country column, so it will still be in your output.
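As a usage sketch (reusing the people DataFrame and the col_mapping dict from the question):

col_mapping = dict(zip(["name", "age"], ["a", "b"]))

renamed = rename_columns(people, col_mapping)
# renamed.columns is now ['b', 'country', 'a']; drop 'country' separately if you don't want it
renamed.show()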
I have created a dataframe as shown
import ast
from pyspark.sql.functions import udf, col

values = [(u"['2','4','713']", 10), (u"['12','245']", 20), (u"['101','12']", 30)]
df = sqlContext.createDataFrame(values, ['list', 'A'])
df.show()
+---------------+---+
|           list|  A|
+---------------+---+
|['2','4','713']| 10|
|   ['12','245']| 20|
|   ['101','12']| 30|
+---------------+---+
How can I convert the above dataframe such that each element in the list is a float and is within a proper list?
I tried the below one:
def df_amp_conversion(df_modelamp):
    string_list_to_list = udf(lambda row: ast.literal_eval(str(row)))
    df_modelamp = df_modelamp.withColumn('float_list', string_list_to_list(col("list")))

df2 = df_amp_conversion(df)
But the data remains the same without a change.
I don't want to convert the dataframe to pandas or use collect, as it is memory intensive. If possible, give me an optimal solution. I am using PySpark.
That's because you forgot to specify the return type of the udf:
udf(lambda row: ast.literal_eval(str(row)), "array<integer>")
Though something like this would be more efficient:
from pyspark.sql.functions import regexp_replace, rtrim, ltrim, split
df = spark.createDataFrame(["""u'[23,4,77,890,4]"""], "string").toDF("list")
df.select(split(
    regexp_replace("list", "^u'\\[|\\]$", ""), ","
).cast("array<integer>").alias("list")).show()
# +-------------------+
# |               list|
# +-------------------+
# |[23, 4, 77, 890, 4]|
# +-------------------+
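If you actually need floats rather than integers (the question asks for a list of floats), the same expression can be cast to array<float>; a sketch reusing the df defined just above:

df.select(split(
    regexp_replace("list", "^u'\\[|\\]$", ""), ","
).cast("array<float>").alias("list")).show(truncate=False)
# each element is now a float, e.g. [23.0, 4.0, 77.0, 890.0, 4.0]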
I can produce the correct result in Python 3 with a small change in the definition of the function df_amp_conversion: you didn't return the value of df_modelamp! This code works properly for me:
import ast
from pyspark.sql.functions import udf, col
values = [(u"['2','4','713']",10),(u"['12','245']",20),(u"['101','12']",30)]
df = sqlContext.createDataFrame(values,['list','A'])
def df_amp_conversion(df_modelamp):
    string_list_to_list = udf(lambda row: ast.literal_eval(str(row)))
    df_modelamp = df_modelamp.withColumn('float_list', string_list_to_list(col("list")))
    return df_modelamp
df2 = df_amp_conversion(df)
df2.show()
# +---------------+---+-----------+
# |           list|  A| float_list|
# +---------------+---+-----------+
# |['2','4','713']| 10|[2, 4, 713]|
# |   ['12','245']| 20|  [12, 245]|
# |   ['101','12']| 30|  [101, 12]|
# +---------------+---+-----------+
So I have a dataframe df like so,
+---+-----+
| ID|COL_A|
+---+-----+
|  1|  123|
+---+-----+
I also have a dict like so:
{"COL_B":"abc","COL_C":""}
Now, what I have to do is update df so that each key in the dict becomes a new column name and the key's value becomes that column's constant value.
Expected df should be like:
+---+-----+-----+-----+
| ID|COL_A|COL_B|COL_C|
+---+-----+-----+-----+
|  1|  123|  abc|     |
+---+-----+-----+-----+
Now here's my Python (pandas) code to do it, which works fine...
input_data = pd.read_csv(inputFilePath, dtype=str)

for key, value in mapRow.iteritems():  # mapRow is the dict
    if value is None:
        input_data[key] = ""
    else:
        input_data[key] = value
Now I'm migrating this code to pyspark and would like to know how to do it in pyspark?
Thanks for the help.
To combine RDDs, we use zip or join. Below is the explanation using zip: zip concatenates the two RDDs, and map flattens the result.
from pyspark.sql import Row
rdd_1 = sc.parallelize([Row(ID=1,COL_A=2)])
rdd_2 = sc.parallelize([Row(COL_B="abc",COL_C=" ")])
result_rdd = rdd_1.zip(rdd_2).map(lambda x: [j for i in x for j in i])
NOTE: I don't have pyspark with me at the moment, so this isn't tested.
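Since you are working with a DataFrame, a simpler DataFrame-level sketch (assuming df is the Spark DataFrame version of your data and mapRow is the dict from the question) is to loop over the dict and add each key as a constant column with lit:

from pyspark.sql.functions import lit

# Add one constant column per dict entry; None becomes an empty string
for key, value in mapRow.items():
    df = df.withColumn(key, lit(value if value is not None else ""))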
This creates my example dataframe:
from pyspark.sql.functions import lit

df = sc.parallelize([('abc',), ('def',)]).toDF()
df = df.selectExpr("_1 as one")
df = df.withColumn("two", lit('z'))
df.show()
looking like this:
+---+---+
|one|two|
+---+---+
|abc|  z|
|def|  z|
+---+---+
Now what I want to do is a series of SQL where/like statements, appending each letter to column two depending on whether column one matches it.
in "pseudo code" it looks like this:
for letter in ['a','b','c','d']:
    df = df['two'].where(col('one').like("%{}%".format(letter))) += letter
finally resulting in a df looking like this:
+---+----+
|one| two|
+---+----+
|abc|zabc|
|def|  zd|
+---+----+
If you are using a list of strings to subset your string column, you can best use broadcast variables. Let's start with a more realistic example where your strings still contain spaces:
from pyspark.sql.functions import lit, udf

df = sc.parallelize([('a b c',), ('d e f',)]).toDF()
df = df.selectExpr("_1 as one")
df = df.withColumn("two", lit('z'))
Then we create a broadcast variable from a list of letters, and define a udf that uses it to subset a list of strings and concatenate the matches with the value in another column, returning one string:
letters = ['a','b','c','d']
letters_bd = sc.broadcast(letters)
def subs(col1, col2):
    l_subset = [x for x in col1 if x in letters_bd.value]
    return col2 + ' ' + ' '.join(l_subset)
subs_udf = udf(subs)
To apply the above, the string we are subsetting needs to be converted to a list, so we use the function split() first and then apply our udf:
from pyspark.sql.functions import col, split
df.withColumn("three", split(col('one'), r'\W+')) \
.withColumn("three", subs_udf("three", "two")) \
.show()
+-----+---+-------+
|  one|two|  three|
+-----+---+-------+
|a b c|  z|z a b c|
|d e f|  z|    z d|
+-----+---+-------+
Or without a udf, using regexp_replace and concat, if your letters can comfortably fit into the regex expression:
from pyspark.sql.functions import regexp_replace, col, concat, lit
df.withColumn("three", concat(col('two'), lit(' '),
regexp_replace(col('one'), '[^abcd]', ' ')))