How to normalize and create a similarity matrix in PySpark?

I have seen many Stack Overflow questions about similarity matrices, but they deal with RDDs or other cases, and I could not find a direct answer to my problem, so I decided to post a new question.
Problem
import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import functions as F, Window
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler,Normalizer
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
spark = pyspark.sql.SparkSession.builder.appName('app').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
# pandas dataframe
pdf = pd.DataFrame({'user_id': ['user_0','user_1','user_2'],
                    'apple': [0,1,5],
                    'good banana': [3,0,1],
                    'carrot': [1,2,2]})
# spark dataframe
df = sqlContext.createDataFrame(pdf)
df.show()
+-------+-----+-----------+------+
|user_id|apple|good banana|carrot|
+-------+-----+-----------+------+
| user_0| 0| 3| 1|
| user_1| 1| 0| 2|
| user_2| 5| 1| 2|
+-------+-----+-----------+------+
Normalize and create a similarity matrix using pandas
from sklearn.preprocessing import normalize
pdf = pdf.set_index('user_id')
item_norm = normalize(pdf, axis=0)  # normalize each item (NOT each user)
item_sim = item_norm.T.dot(item_norm)
df_item_sim = pd.DataFrame(item_sim,index=pdf.columns,columns=pdf.columns)
apple good banana carrot
apple 1.000000 0.310087 0.784465
good banana 0.310087 1.000000 0.527046
carrot 0.784465 0.527046 1.000000
Question: how do I get a similarity matrix like the one above using PySpark?
I also want to run KMeans on that data.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
# I want to do this...
model = KMeans(k=2, seed=1).fit(df.select('norm_features'))
df = model.transform(df)
df.show()
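For the norm_features part, here is a minimal sketch of how the column could be built with the VectorAssembler and Normalizer imported above (one assumption to note: Normalizer normalizes each row, i.e. each user, to unit L2 norm, which suits clustering users but is not the column-wise normalization used for the item-similarity matrix):
from pyspark.ml.feature import VectorAssembler, Normalizer
from pyspark.ml.clustering import KMeans

assembler = VectorAssembler(inputCols=['apple', 'good banana', 'carrot'], outputCol='features')
normalizer = Normalizer(inputCol='features', outputCol='norm_features', p=2.0)

vec_df = normalizer.transform(assembler.transform(df))
model = KMeans(k=2, seed=1, featuresCol='norm_features').fit(vec_df)
model.transform(vec_df).select('user_id', 'norm_features', 'prediction').show(truncate=False)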
References
Cosine Similarity for two pyspark dataframes
Apache Spark Python Cosine Similarity over DataFrames

import pyspark.sql.functions as F
df.show()
+-------+-----+-----------+------+
|user_id|apple|good banana|carrot|
+-------+-----+-----------+------+
| user_0| 0| 3| 1|
| user_1| 1| 0| 2|
| user_2| 5| 1| 2|
+-------+-----+-----------+------+
Swap rows and columns by unpivoting and pivoting:
df2 = df.selectExpr(
    'user_id',
    'stack(3, ' + ', '.join(["'%s', `%s`" % (c, c) for c in df.columns[1:]]) + ') as (fruit, items)'
).groupBy('fruit').pivot('user_id').agg(F.first('items'))
df2.show()
+-----------+------+------+------+
| fruit|user_0|user_1|user_2|
+-----------+------+------+------+
| apple| 0| 1| 5|
|good banana| 3| 0| 1|
| carrot| 1| 2| 2|
+-----------+------+------+------+
Normalize:
df3 = df2.select(
    'fruit',
    *[
        (
            F.col(c) /
            F.sqrt(
                sum([F.col(cc) * F.col(cc) for cc in df2.columns[1:]])
            )
        ).alias(c) for c in df2.columns[1:]
    ]
)
df3.show()
+-----------+------------------+-------------------+-------------------+
| fruit| user_0| user_1| user_2|
+-----------+------------------+-------------------+-------------------+
| apple| 0.0|0.19611613513818404| 0.9805806756909202|
|good banana|0.9486832980505138| 0.0|0.31622776601683794|
| carrot|0.3333333333333333| 0.6666666666666666| 0.6666666666666666|
+-----------+------------------+-------------------+-------------------+
Do the matrix multiplication:
df4 = (df3.alias('t1').repartition(10)
       .crossJoin(df3.alias('t2').repartition(10))
       .groupBy('t1.fruit')
       .pivot('t2.fruit', df.columns[1:])
       .agg(F.first(sum([F.col('t1.' + c) * F.col('t2.' + c) for c in df3.columns[1:]])))
      )
df4.show()
+-----------+-------------------+-------------------+------------------+
| fruit| apple| good banana| carrot|
+-----------+-------------------+-------------------+------------------+
| apple| 1.0000000000000002|0.31008683647302115|0.7844645405527362|
|good banana|0.31008683647302115| 0.9999999999999999|0.5270462766947298|
| carrot| 0.7844645405527362| 0.5270462766947298| 1.0|
+-----------+-------------------+-------------------+------------------+
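An alternative sketch for the similarity matrix itself: mllib's RowMatrix.columnSimilarities() computes exact cosine similarities between columns (here, the items) and returns the upper triangle as a CoordinateMatrix of MatrixEntry(i, j, value), with indices following the column order apple, good banana, carrot. Assuming the same df as above:
from pyspark.mllib.linalg.distributed import RowMatrix

mat = RowMatrix(df.select('apple', 'good banana', 'carrot')
                  .rdd.map(lambda row: list(row)))
mat.columnSimilarities().entries.collect()
# e.g. [MatrixEntry(0, 1, 0.3100...), MatrixEntry(0, 2, 0.7844...), MatrixEntry(1, 2, 0.5270...)]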

Related

Filter then count for many different thresholds

I want to calculate the number of rows that satisfy a condition on a very large dataframe, which can be done with
df.filter(col("value") >= thresh).count()
I want to know the result for each threshold in the range [1, 10]. Enumerating each threshold and running this action would scan the dataframe 10 times, which is slow.
Can I achieve it by scanning the df only once?
Create an indicator column for each threshold, then sum:
import random
import pyspark.sql.functions as F
from pyspark.sql import Row
df = spark.createDataFrame([Row(value=random.randint(0,10)) for _ in range(1_000_000)])
df.select([
    (F.col("value") >= thresh)
    .cast("int")
    .alias(f"ind_{thresh}")
    for thresh in range(1, 11)
]).groupBy().sum().show()
# +----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+
# |sum(ind_1)|sum(ind_2)|sum(ind_3)|sum(ind_4)|sum(ind_5)|sum(ind_6)|sum(ind_7)|sum(ind_8)|sum(ind_9)|sum(ind_10)|
# +----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+
# | 908971| 818171| 727240| 636334| 545463| 454279| 363143| 272460| 181729| 90965|
# +----------+----------+----------+----------+----------+----------+----------+----------+----------+-----------+
Using conditional aggregation with when expressions should do the job.
Here's an example:
from pyspark.sql import functions as F
df = spark.createDataFrame([(1,), (2,), (3,), (4,), (4,), (6,), (7,)], ["value"])
count_expr = [
    F.count(F.when(F.col("value") >= th, 1)).alias(f"gte_{th}")
    for th in range(1, 11)
]
df.select(*count_expr).show()
#+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
#|gte_1|gte_2|gte_3|gte_4|gte_5|gte_6|gte_7|gte_8|gte_9|gte_10|
#+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
#| 7| 6| 5| 4| 2| 2| 1| 0| 0| 0|
#+-----+-----+-----+-----+-----+-----+-----+-----+-----+------+
Using a user-defined function udf from pyspark.sql.functions:
import pandas as pd
import numpy as np
df = pd.DataFrame(np.random.randint(0,100, size=(20)), columns=['val'])
thres = [90, 80, 30] # these are the thresholds
thres.sort(reverse=True) # list needs to be sorted
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
spark = SparkSession.builder \
    .master("local[2]") \
    .appName("myApp") \
    .getOrCreate()
sparkDF = spark.createDataFrame(df)
myUdf = udf(lambda x: 0 if x>thres[0] else 1 if x>thres[1] else 2 if x>thres[2] else 3)
sparkDF = sparkDF.withColumn("rank", myUdf(sparkDF.val))
sparkDF.show()
# +---+----+
# |val|rank|
# +---+----+
# | 28| 3|
# | 54| 2|
# | 19| 3|
# | 4| 3|
# | 74| 2|
# | 62| 2|
# | 95| 0|
# | 19| 3|
# | 55| 2|
# | 62| 2|
# | 33| 2|
# | 93| 0|
# | 81| 1|
# | 41| 2|
# | 80| 2|
# | 53| 2|
# | 14| 3|
# | 16| 3|
# | 30| 3|
# | 77| 2|
# +---+----+
sparkDF.groupby(['rank']).count().show()
# Out:
# +----+-----+
# |rank|count|
# +----+-----+
# | 3| 7|
# | 0| 2|
# | 1| 1|
# | 2| 10|
# +----+-----+
A value gets rank i if it is strictly greater than thres[i] but less than or equal to thres[i-1]. This should minimize the number of comparisons.
For thres = [90, 80, 30] the ranks are: 0 → (90, max], 1 → (80, 90], 2 → (30, 80], 3 → [min, 30].
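A hedged, UDF-free variant of the same ranking, with the thresholds from thres written out as when/otherwise conditions so the computation stays in the JVM (same sparkDF as above):
from pyspark.sql import functions as F

rank_col = (F.when(F.col('val') > 90, 0)
             .when(F.col('val') > 80, 1)
             .when(F.col('val') > 30, 2)
             .otherwise(3))
sparkDF.withColumn('rank', rank_col).groupBy('rank').count().show()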

Compare two different columns from two different pyspark dataframe

I'm trying to compare two columns that live in two different data frames; if I find a match I want to return 1, otherwise None.
df1 (with a first_key column), df2 (with a second_key column), and the expected output (df1 with an added impact column) were shown as images in the original post.
I have tried the below code -
def getImpact(row):
    match = df2.filter(df2.second_key == row)
    if match.count() > 0:
        return 1
    return None

udf_sol = udf(lambda x: getImpact(x), IntegerType())
df1 = df1.withColumn('impact', udf_sol(df1.first_key))
But I'm getting the error below -
TypeError: cannot pickle '_thread.RLock' object
Can anyone help me to achieve the expected output as shown above?
Thanks
Assuming first_key and second_key are unique, you can opt for a join across the dataframes.
More examples and explanation can be found here
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pyspark.sql.functions as F

sc = SparkContext.getOrCreate()
sql = SQLContext(sc)

data_list1 = [
    ("abcd", "Key1"),
    ("jkasd", "Key2"),
    ("oigoa", "Key3"),
    ("ad", "Key4"),
    ("bas", "Key5"),
    ("lkalsjf", "Key6"),
    ("bsawva", "Key7"),
]
data_list2 = [
    ("cashj", "Key1", 10),
    ("ax", "Key11", 12),
    ("safa", "Key5", 21),
    ("safasf", "Key6", 78),
    ("vasv", "Key3", 4),
    ("wgaga", "Key8", 0),
    ("saasfas", "Key7", 10),
]
sparkDF1 = sql.createDataFrame(data_list1, ['data', 'first_key'])
sparkDF2 = sql.createDataFrame(data_list2, ['temp_data', 'second_key', 'frinks'])
>>> sparkDF1.show()
+-------+---------+
| data|first_key|
+-------+---------+
| abcd| Key1|
| jkasd| Key2|
| oigoa| Key3|
| ad| Key4|
| bas| Key5|
|lkalsjf| Key6|
| bsawva| Key7|
+-------+---------+
>>> sparkDF2.show()
+---------+----------+------+
|temp_data|second_key|frinks|
+---------+----------+------+
| cashj| Key1| 10|
| ax| Key11| 12|
| safa| Key5| 21|
| safasf| Key6| 78|
| vasv| Key3| 4|
| wgaga| Key8| 0|
| saasfas| Key7| 10|
+---------+----------+------+
#### Joining the dataframes on common columns
finalDF = sparkDF1.join(
    sparkDF2,
    (sparkDF1['first_key'] == sparkDF2['second_key']),
    'left'
).select(sparkDF1['*'], sparkDF2['frinks']).orderBy('frinks')
### Identify impact depending on whether the frinks value is null or not
finalDF = finalDF.withColumn('impact',F.when(F.col('frinks').isNull(),0).otherwise(1))
>>> finalDF.show()
+-------+---------+------+------+
| data|first_key|frinks|impact|
+-------+---------+------+------+
| jkasd| Key2| null| 0|
| ad| Key4| null| 0|
| oigoa| Key3| 4| 1|
| abcd| Key1| 10| 1|
| bsawva| Key7| 10| 1|
| bas| Key5| 21| 1|
|lkalsjf| Key6| 78| 1|
+-------+---------+------+------+
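As an aside on the original error: the TypeError comes from referencing a DataFrame (and, through it, the SparkContext) inside a UDF, which cannot be pickled. If a UDF really is required, one hedged workaround is to collect and broadcast just the keys as a plain Python set:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType

# collect the distinct keys to the driver and broadcast them to the executors
keys = {r.second_key for r in sparkDF2.select('second_key').distinct().collect()}
b_keys = sc.broadcast(keys)

is_match = udf(lambda k: 1 if k in b_keys.value else None, IntegerType())
sparkDF1.withColumn('impact', is_match(col('first_key'))).show()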
import numpy as np
df1['final'] = np.where(df1['first_key'] == df2['second_key'], '1', 'None')
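Note that the np.where line above is a pandas approach and assumes df1 and df2 are pandas DataFrames of equal length with aligned row order. When the order is not guaranteed (and the keys are unique), a merge on the keys is the safer sketch:
import numpy as np

# left-merge keeps df1's row order; a non-null second_key means a match was found
merged = df1.merge(df2[['second_key']], left_on='first_key',
                   right_on='second_key', how='left')
df1['impact'] = np.where(merged['second_key'].notna(), 1, None)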

Applying map function on dataframe's columns

I need to reduce all the values of each of the dataframe's columns into a single value per column. The columns stay intact; I am just summing the respective values.
For this purpose I intend to utilize this function:
def sum_col(data, col):
    return data.select(f.sum(col)).collect()[0][0]
I was now thinking of doing something like this:
data = data.map(lambda current_col: sum_col(data, current_col))
Is this doable, or do I need another way to merge all the values of the columns?
You can achieve this with the sum function:
import pyspark.sql.functions as f
df.select(*[f.sum(cols).alias(cols) for cols in df.columns]).show()
+----+---+---+
|val1| x| y|
+----+---+---+
| 36| 29|159|
+----+---+---+
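If the goal is to get all the column totals back into Python in one pass, here is a small sketch collecting the same aggregation into a dict (column names val1, x, y assumed from the output above):
import pyspark.sql.functions as f

totals = df.agg(*[f.sum(c).alias(c) for c in df.columns]).collect()[0].asDict()
# e.g. {'val1': 36, 'x': 29, 'y': 159}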
To sum all your columns into a new column, you can use a list comprehension with Python's built-in sum function:
import pyspark.sql.functions as F

tst = sqlContext.createDataFrame([(10,7,14),(5,1,4),(9,8,10),(2,6,90),(7,2,30),(3,5,11)], schema=['val1','x','y'])
tst_sum = tst.withColumn("sum_col", sum([tst[coln] for coln in tst.columns]))
results:
tst_sum.show()
+----+---+---+-------+
|val1| x| y|sum_col|
+----+---+---+-------+
| 10| 7| 14| 31|
| 5| 1| 4| 10|
| 9| 8| 10| 27|
| 2| 6| 90| 98|
| 7| 2| 30| 39|
| 3| 5| 11| 19|
+----+---+---+-------+
Note: if you had imported the sum function from pyspark as from pyspark.sql.functions import sum, then you have to rename it to something else, e.g. from pyspark.sql.functions import sum as sum_pyspark, so that Python's built-in sum is still used for the column addition here.
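A small sketch of that renaming, assuming the same tst dataframe, so that Python's built-in sum and pyspark's aggregate sum can coexist:
from builtins import sum as py_sum                    # Python's built-in sum
from pyspark.sql.functions import sum as sum_pyspark  # pyspark's aggregate sum

tst_sum = tst.withColumn("sum_col", py_sum([tst[c] for c in tst.columns]))
tst.agg(sum_pyspark("val1")).show()                   # the aggregate is still available under its alias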

Append list of lists as column to PySpark's dataframe (Concatenating two dataframes without common column)

I have some dataframe in Pyspark:
from pyspark.sql import SQLContext, SparkSession
spark = SparkSession.builder.getOrCreate()
sqlcontext = SQLContext(spark)
df = sqlcontext.createDataFrame([['a'],['b'],['c'],['d'],['e']], ['id'])
df.show()
+---+
| id|
+---+
| a|
| b|
| c|
| d|
| e|
+---+
And I have a list of lists:
l = [[1,1], [2,2], [3,3], [4,4], [5,5]]
Is it possible to append this list as a column to df? Namely, the first element of l should appear next to the first row of df, the second element of l next to the second row of df, etc. It should look like this:
+---+------+
| id|     l|
+---+------+
|  a|[1, 1]|
|  b|[2, 2]|
|  c|[3, 3]|
|  d|[4, 4]|
|  e|[5, 5]|
+---+------+
UDFs are generally slow, but a more efficient way without using any UDFs would be:
import pyspark.sql.functions as F
ldf = spark.createDataFrame(l, schema = "array<int>")
df1 = df.withColumn("m_id", F.monotonically_increasing_id())
df2 = ldf.withColumn("m_id", F.monotonically_increasing_id())
df3 = df2.join(df1, "m_id", "outer").drop("m_id")
df3.select("id", "value").show()
+---+------+
| id| value|
+---+------+
| a|[1, 1]|
| b|[2, 2]|
| d|[4, 4]|
| c|[3, 3]|
| e|[5, 5]|
+---+------+
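One caveat on the join above: monotonically_increasing_id() only guarantees increasing ids, not the same ids on both sides, so the alignment can break once the data is spread over several partitions. A hedged sketch using zipWithIndex(), which produces consecutive 0..n-1 indices on both sides:
# index both sides with consecutive ids, then join on that id
df_idx = df.rdd.zipWithIndex().map(lambda r: (r[1], r[0].id)).toDF(["m_id", "id"])
l_idx = (spark.sparkContext.parallelize(l)
         .zipWithIndex()
         .map(lambda r: (r[1], r[0]))
         .toDF(["m_id", "value"]))
df_idx.join(l_idx, "m_id").drop("m_id").show()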
Assuming that you have the same number of rows in your df as items in your list (df.count() == len(l)),
you can add a row_id (to specify the order) to your df and, based on that, access the corresponding item in your list (l).
from pyspark.sql.functions import row_number, lit
from pyspark.sql.window import *
df = df.withColumn("row_num", row_number().over(Window().orderBy(lit('A'))))
df.show()
The above code will produce:
+---+-------+
| id|row_num|
+---+-------+
|  a|      1|
|  b|      2|
|  c|      3|
|  d|      4|
|  e|      5|
+---+-------+
Then, you can just iterate your df and access the specified index in your list:
def map_df(row):
    return (row.id, l[row.row_num - 1])

new_df = df.rdd.map(map_df).toDF(["id", "l"])
new_df.show()
Output:
+---+------+
| id|     l|
+---+------+
|  a|[1, 1]|
|  b|[2, 2]|
|  c|[3, 3]|
|  d|[4, 4]|
|  e|[5, 5]|
+---+------+
Thanks to Cesar's answer, I figured out how to do it without converting the dataframe to an RDD and back. It would be something like this:
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.functions import row_number, lit, udf
from pyspark.sql.window import Window
from pyspark.sql.types import ArrayType, FloatType, IntegerType
spark = SparkSession.builder.getOrCreate()
sqlcontext = SQLContext(spark)
df = sqlcontext.createDataFrame([['a'],['b'],['c'],['d'],['e']], ['id'])
df = df.withColumn("row_num", row_number().over(Window().orderBy(lit('A'))))
new_col = [[1.,1.], [2.,2.], [3.,3.], [4.,4.], [5.,5.]]
map_list_to_column = udf(lambda row_num: new_col[row_num -1], ArrayType(FloatType()))
df.withColumn('new_col', map_list_to_column(df.row_num)).drop('row_num').show()

Spark pandas_udf is not faster

I'm facing a heavy data transformation. In a nutshell, I have columns of data, each containing strings which correspond to some ordinals. For example, HIGH, MID and LOW. My objective is to map these strings to integers which will preserve the order. In this case, LOW -> 0, MID -> 1 and HIGH -> 2.
Here is a simple function generating such data:
def fresh_df(N=100000, seed=None):
    np.random.seed(seed)
    feat1 = np.random.choice(["HI", "LO", "MID"], size=N)
    feat2 = np.random.choice(["SMALL", "MEDIUM", "LARGE"], size=N)
    pdf = pd.DataFrame({
        "feat1": feat1,
        "feat2": feat2
    })
    return spark.createDataFrame(pdf)
My first approach was:
feat1_dict = {"HI": 1, "MID": 2, "LO": 3}
feat2_dict = {"SMALL": 0, "MEDIUM": 1, "LARGE": 2}
mappings = {
    "feat1": F.create_map([F.lit(x) for x in chain(*feat1_dict.items())]),
    "feat2": F.create_map([F.lit(x) for x in chain(*feat2_dict.items())])
}
for col in df.columns:
    col_map = mappings[col]
    df = df.withColumn(col + "_mapped", col_map[df[col]])
This works as expected, but in reality it turned out to be slow and I wanted to optimize the process. I read about pandas_udf and it gave me hope. Here is the modified code:
feats_dict = {
    "feat1": feat1_dict,
    "feat2": feat2_dict
}
for col_name in df.columns:
    @F.pandas_udf('integer', F.PandasUDFType.SCALAR)
    def map_map(col):
        return col.map(feats_dict[col_name])

    df = df.withColumn(col_name + "_mapped", map_map(df[col_name]))
Alas! When comparing the two versions there was no improvement in terms of execution time. I compared them on a local instance of Spark (using Docker) and on a 5-node EMR cluster (with the default configurations).
I created a notebook where you can see all the code. In general, I used the following imports:
import numpy as np
import pandas as pd
from itertools import chain
from pyspark.sql import functions as F
What am I missing? Why is this process so slow and why there's no improvement when using pandas_udf?
Why so slow? Spark runs in the JVM, while a Python udf runs in a separate Python process, so to make it work all the data has to be serialized, shipped to the Python workers, and deserialized back into the JVM.
You can map the values with the when and otherwise functions instead and avoid that serialize/deserialize round trip, increasing performance.
import numpy as np
import pandas as pd
import pyspark.sql.functions as f
from pyspark.shell import spark
def fresh_df(n=100000, seed=None):
    np.random.seed(seed)
    feat1 = np.random.choice(["HI", "LO", "MID"], size=n)
    feat2 = np.random.choice(["SMALL", "MEDIUM", "LARGE"], size=n)
    pdf = pd.DataFrame({
        "feat1": feat1,
        "feat2": feat2
    })
    return spark.createDataFrame(pdf)
df = fresh_df()
df = df.withColumn('feat1_mapped', f
                   .when(df.feat1 == f.lit('HI'), 1)
                   .otherwise(f.when(df.feat1 == f.lit('MID'), 2).otherwise(3)))
df = df.withColumn('feat2_mapped', f
                   .when(df.feat2 == f.lit('SMALL'), 0)
                   .otherwise(f.when(df.feat2 == f.lit('MEDIUM'), 1).otherwise(2)))
df.show(n=20)
Output
+-----+------+------------+------------+
|feat1| feat2|feat1_mapped|feat2_mapped|
+-----+------+------------+------------+
| LO| SMALL| 3| 0|
| LO|MEDIUM| 3| 1|
| MID|MEDIUM| 2| 1|
| MID| SMALL| 2| 0|
| MID| LARGE| 2| 2|
| MID| SMALL| 2| 0|
| LO| SMALL| 3| 0|
| MID| LARGE| 2| 2|
| MID| LARGE| 2| 2|
| MID| SMALL| 2| 0|
| MID|MEDIUM| 2| 1|
| LO| LARGE| 3| 2|
| HI|MEDIUM| 1| 1|
| LO| SMALL| 3| 0|
| HI|MEDIUM| 1| 1|
| MID| SMALL| 2| 0|
| MID|MEDIUM| 2| 1|
| HI| SMALL| 1| 0|
| HI| LARGE| 1| 2|
| MID| LARGE| 2| 2|
+-----+------+------------+------------+
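If you do want to stay with pandas_udf, here is a hedged sketch in the Spark 3 type-hint style, binding the mapping per column through a small factory (feats_dict and fresh_df as defined in the question). Note that a pandas UDF still ships the data between the JVM and the Python workers via Arrow, so for a plain dictionary lookup the when/otherwise or create_map routes tend to stay ahead:
import pandas as pd
import pyspark.sql.functions as F

def make_mapper(mapping):
    # bind one mapping per returned udf
    @F.pandas_udf("integer")
    def map_map(col: pd.Series) -> pd.Series:
        return col.map(mapping)
    return map_map

df = fresh_df()
for col_name in ["feat1", "feat2"]:
    df = df.withColumn(col_name + "_mapped", make_mapper(feats_dict[col_name])(df[col_name]))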
