I want to check each column of a PySpark DataFrame and, if the column is of a specific dtype, apply certain functions to it. Below are my code and dataset.
dataset:
from pyspark.sql import SparkSession
spark = SparkSession.builder.master('local').appName('Word Count').config('spark.some.config.option', 'some-value').getOrCreate()
df = spark.createDataFrame(
[
('A',1),
('A', 2),
('A',3),
('A', 4),
('B',5),
('B', 6),
('B',7),
('B', 8),
],
['id', 'v']
) # I save this to CSV, so you can just ignore my read-CSV part below.
Code:
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark import SparkContext
sc = SparkContext.getOrCreate()  # the SparkContext is needed for the SQLContext below
sqlContext = SQLContext(sc)
df = sqlContext.read.load('test.csv',
format ='com.databricks.spark.csv',
header='true',
inferSchema='true')
from functools import reduce
from pyspark.sql.functions import col
import numpy as np
i = (reduce(lambda x, y: x.withColumn(y, np.where(col(y).dtypes != 'str', col(y)+2, col(y))), df.columns, df)) # this is the part that I wanted to change.
Side learning request: if possible, can someone show me how to edit only a specific column? I understand using .select, but could someone show an example with a small dataset? Thank you.
My expected output:
+---+---+
| id| v|
+---+---+
| A| 3|
| A| 4|
| A| 5|
| A| 6|
| B| 7|
| B| 8|
| B| 9|
| B| 10|
+---+---+
Side note: I am new to PySpark, so I don't understand why I need to use 'col'. What is it, actually?
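A minimal sketch of one way to do this, assuming the goal implied by the expected output (add 2 to every non-string column and leave string columns untouched); df.dtypes and withColumn are standard PySpark, but the exact condition is an assumption:
from pyspark.sql import functions as F

# Sketch only: df.dtypes is a list of (column_name, dtype_string) pairs,
# so we can branch on the dtype in plain Python instead of np.where.
for name, dtype in df.dtypes:
    if dtype != 'string':                         # assumption: "not str" means any non-string dtype
        df = df.withColumn(name, F.col(name) + 2)  # F.col refers to the column by name
df.show()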
Related
I have seen many Stack Overflow questions about similarity matrices, but they deal with RDDs or other cases, and I could not find a direct answer to my problem, so I decided to post a new question.
Problem
import numpy as np
import pandas as pd
import pyspark
from pyspark.sql import functions as F, Window
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler,Normalizer
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix
spark = pyspark.sql.SparkSession.builder.appName('app').getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)
# pandas dataframe
pdf = pd.DataFrame({'user_id': ['user_0','user_1','user_2'],
'apple': [0,1,5],
'good banana': [3,0,1],
'carrot': [1,2,2]})
# spark dataframe
df = sqlContext.createDataFrame(pdf)
df.show()
+-------+-----+-----------+------+
|user_id|apple|good banana|carrot|
+-------+-----+-----------+------+
| user_0| 0| 3| 1|
| user_1| 1| 0| 2|
| user_2| 5| 1| 2|
+-------+-----+-----------+------+
Normalize and create Similarity Matrix using Pandas
from sklearn.preprocessing import normalize
pdf = pdf.set_index('user_id')
item_norm = normalize(pdf, axis=0) # normalize each item (NOT users)
item_sim = item_norm.T.dot(item_norm)
df_item_sim = pd.DataFrame(item_sim,index=pdf.columns,columns=pdf.columns)
apple good banana carrot
apple 1.000000 0.310087 0.784465
good banana 0.310087 1.000000 0.527046
carrot 0.784465 0.527046 1.000000
Question: how do I get a similarity matrix like the one above using PySpark?
I want to run KMeans on that data.
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans
# I want to do this...
model = KMeans(k=2, seed=1).fit(df.select('norm_features'))
df = model.transform(df)
df.show()
References
Cosine Similarity for two pyspark dataframes
Apache Spark Python Cosine Similarity over DataFrames
import pyspark.sql.functions as F
df.show()
+-------+-----+-----------+------+
|user_id|apple|good banana|carrot|
+-------+-----+-----------+------+
| user_0| 0| 3| 1|
| user_1| 1| 0| 2|
| user_2| 5| 1| 2|
+-------+-----+-----------+------+
Swap rows and columns by unpivoting and pivoting:
df2 = df.selectExpr(
'user_id',
'stack(3, ' + ', '.join(["'%s', `%s`" % (c, c) for c in df.columns[1:]]) + ') as (fruit, items)'
).groupBy('fruit').pivot('user_id').agg(F.first('items'))
df2.show()
+-----------+------+------+------+
| fruit|user_0|user_1|user_2|
+-----------+------+------+------+
| apple| 0| 1| 5|
|good banana| 3| 0| 1|
| carrot| 1| 2| 2|
+-----------+------+------+------+
Normalize:
df3 = df2.select(
'fruit',
*[
(
F.col(c) /
F.sqrt(
sum([F.col(cc)*F.col(cc) for cc in df2.columns[1:]])
)
).alias(c) for c in df2.columns[1:]
]
)
df3.show()
+-----------+------------------+-------------------+-------------------+
| fruit| user_0| user_1| user_2|
+-----------+------------------+-------------------+-------------------+
| apple| 0.0|0.19611613513818404| 0.9805806756909202|
|good banana|0.9486832980505138| 0.0|0.31622776601683794|
| carrot|0.3333333333333333| 0.6666666666666666| 0.6666666666666666|
+-----------+------------------+-------------------+-------------------+
Do the matrix multiplication:
df4 = (df3.alias('t1').repartition(10)
.crossJoin(df3.alias('t2').repartition(10))
.groupBy('t1.fruit')
.pivot('t2.fruit', df.columns[1:])
.agg(F.first(sum([F.col('t1.'+c) * F.col('t2.'+c) for c in df3.columns[1:]])))
)
df4.show()
+-----------+-------------------+-------------------+------------------+
| fruit| apple| good banana| carrot|
+-----------+-------------------+-------------------+------------------+
| apple| 1.0000000000000002|0.31008683647302115|0.7844645405527362|
|good banana|0.31008683647302115| 0.9999999999999999|0.5270462766947298|
| carrot| 0.7844645405527362| 0.5270462766947298| 1.0|
+-----------+-------------------+-------------------+------------------+
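Since the question also asks about running KMeans on the normalized data, here is a hedged sketch under assumed column names ('features' and 'norm_features' are invented): assemble the item columns of the original df into one vector per user, L2-normalize it, and fit KMeans with k=2:
from pyspark.ml.feature import VectorAssembler, Normalizer
from pyspark.ml.clustering import KMeans

# Sketch: build a feature vector per user from the item columns, normalize, cluster.
assembler = VectorAssembler(inputCols=df.columns[1:], outputCol='features')
normalizer = Normalizer(inputCol='features', outputCol='norm_features', p=2.0)
vec_df = normalizer.transform(assembler.transform(df))
model = KMeans(featuresCol='norm_features', k=2, seed=1).fit(vec_df)
model.transform(vec_df).select('user_id', 'prediction').show()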
I have some dataframe in Pyspark:
from pyspark.sql import SQLContext, SparkSession
spark = SparkSession.builder.getOrCreate()
sqlcontext = SQLContext(spark)
df = sqlcontext.createDataFrame([['a'],['b'],['c'],['d'],['e']], ['id'])
df.show()
+---+
| id|
+---+
| a|
| b|
| c|
| d|
| e|
+---+
And I have a list of lists:
l = [[1,1], [2,2], [3,3], [4,4], [5,5]]
Is it possible to append this list as a column to df? Namely, the first element of l should appear next to the first row of df, the second element of l next to the second row of df, etc. It should look like this:
+---+------+
| id|     l|
+---+------+
|  a| [1,1]|
|  b| [2,2]|
|  c| [3,3]|
|  d| [4,4]|
|  e| [5,5]|
+---+------+
UDFs are generally slow, but a more efficient way, without using any UDFs, would be:
import pyspark.sql.functions as F
ldf = spark.createDataFrame(l, schema = "array<int>")
df1 = df.withColumn("m_id", F.monotonically_increasing_id())
df2 = ldf.withColumn("m_id", F.monotonically_increasing_id())
df3 = df2.join(df1, "m_id", "outer").drop("m_id")
df3.select("id", "value").show()
+---+------+
| id| value|
+---+------+
| a|[1, 1]|
| b|[2, 2]|
| d|[4, 4]|
| c|[3, 3]|
| e|[5, 5]|
+---+------+
This assumes that you have the same number of rows in your df as items in your list (df.count() == len(l)).
You can add a row number (to specify the order) to your df and, based on that, access the corresponding item in your list (l).
from pyspark.sql.functions import row_number, lit
from pyspark.sql.window import *
df = df.withColumn("row_num", row_number().over(Window().orderBy(lit('A'))))
df.show()
The code above produces:
+---+-------+
| id|row_num|
+---+-------+
|  a|      1|
|  b|      2|
|  c|      3|
|  d|      4|
|  e|      5|
+---+-------+
Then, you can just iterate over your df and access the specified index in your list:
def map_df(row):
    return (row.id, l[row.row_num - 1])
new_df = df.rdd.map(map_df).toDF(["id", "l"])
new_df.show()
Output:
+---+------+
| id| l|
+---+------+
|  a|[1, 1]|
|  b|[2, 2]|
|  c|[3, 3]|
|  d|[4, 4]|
|  e|[5, 5]|
+---+------+
Thanks to Cesar's answer, I figured out how to do it without converting the DataFrame to an RDD and back. It would be something like this:
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql.functions import row_number, lit, udf
from pyspark.sql.window import Window
from pyspark.sql.types import ArrayType, FloatType, IntegerType
spark = SparkSession.builder.getOrCreate()
sqlcontext = SQLContext(spark)
df = sqlcontext.createDataFrame([['a'],['b'],['c'],['d'],['e']], ['id'])
df = df.withColumn("row_num", row_number().over(Window().orderBy(lit('A'))))
new_col = [[1.,1.], [2.,2.], [3.,3.], [4.,4.], [5.,5.]]
map_list_to_column = udf(lambda row_num: new_col[row_num -1], ArrayType(FloatType()))
df.withColumn('new_col', map_list_to_column(df.row_num)).drop('row_num').show()
I have a data frame that looks like so:
>>> l = [('a', 'foo', 1), ('b', 'bar', 1), ('a', 'biz', 6), ('c', 'bar', 3), ('c', 'biz', 2)]
>>> df = spark.createDataFrame(l, ('uid', 'code', 'level'))
>>> df.show()
+---+----+-----+
|uid|code|level|
+---+----+-----+
| a| foo| 1|
| b| bar| 1|
| a| biz| 6|
| c| bar| 3|
| c| biz| 2|
+---+----+-----+
What I'm trying to do is group the code and level values into a list of dicts and dump that list as a JSON string so that I can save the data frame to disk. The result would look like:
>>> df.show()
+---+--------------------------+
|uid| json |
+---+--------------------------+
| a| '[{"foo":1}, {"biz":6}]' |
| b| '[{"bar":1}]' |
| c| '[{"bar":3}, {"biz":2}]' |
+---+--------------------------+
I'm still pretty new to PySpark, and I'm having a lot of trouble figuring out how to get this result. I almost surely need a groupBy, and I've tried implementing it by creating a new StringType column called "json" and then using the pandas_udf decorator, but I'm getting errors about unhashable types because, as I've found out, the way I'm accessing the data accesses the whole column rather than just the row.
>>> df = df.withColumn('json', F.list(''))
>>> schema = df.schema
>>> @pandas_udf(schema, F.PandasUDFType.GROUPED_MAP)
..: def to_json(pdf):
..: return pdf.assign(serial=json.dumps({pdf.code:pdf.level}))
I've considered using string concatenation between the two columns together with collect_set, but that feels wrong as well, since it could write something to disk that merely has a string representation and cannot be loaded back as JSON. Any help is appreciated.
There's no need for a pandas_udf in this case. to_json, collect_list and create_map should be all you need:
import pyspark.sql.functions as f
df.groupby('uid').agg(
f.to_json(
f.collect_list(
f.create_map('code', 'level')
)
).alias('json')
).show(3, False)
+---+---------------------+
|uid|json |
+---+---------------------+
|c |[{"bar":3},{"biz":2}]|
|b |[{"bar":1}] |
|a |[{"foo":1},{"biz":6}]|
+---+---------------------+
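To check that the resulting string really is loadable JSON (the concern raised in the question), a hedged round-trip sketch using from_json with an assumed array<map<string,int>> schema:
from pyspark.sql import functions as f
from pyspark.sql.types import ArrayType, MapType, StringType, IntegerType

# Sketch: parse the produced JSON string back into an array of maps.
json_df = df.groupby('uid').agg(
    f.to_json(f.collect_list(f.create_map('code', 'level'))).alias('json')
)
json_df.withColumn(
    'parsed', f.from_json('json', ArrayType(MapType(StringType(), IntegerType())))
).show(truncate=False)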
I want to create a sample single-column DataFrame, but the following code is not working:
df = spark.createDataFrame(["10","11","13"], ("age"))
## ValueError
## ...
## ValueError: Could not parse datatype: age
The expected result:
age
10
11
13
the following code is not working
With single elements, you need to provide the schema as a type:
spark.createDataFrame(["10","11","13"], "string").toDF("age")
or DataType:
from pyspark.sql.types import StringType
spark.createDataFrame(["10","11","13"], StringType()).toDF("age")
To provide a name, the elements should be tuples and the schema a sequence of names:
spark.createDataFrame([("10", ), ("11", ), ("13", )], ["age"])
Well, there is a pretty easy method for creating a sample DataFrame in PySpark:
>>> df = sc.parallelize([[1,2,3], [2,3,4]]).toDF()
>>> df.show()
+---+---+---+
| _1| _2| _3|
+---+---+---+
| 1| 2| 3|
| 2| 3| 4|
+---+---+---+
To create it with column names:
>>> df1 = sc.parallelize([[1,2,3], [2,3,4]]).toDF(("a", "b", "c"))
>>> df1.show()
+---+---+---+
| a| b| c|
+---+---+---+
| 1| 2| 3|
| 2| 3| 4|
+---+---+---+
This way, there is no need to define a schema either. I hope this is the simplest way:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([{"a": "x", "b": "y", "c": "3"}])
Output: (no need to define schema)
+---+---+---+
|  a|  b|  c|
+---+---+---+
|  x|  y|  3|
+---+---+---+
For pandas + pyspark users, if you've already installed pandas in the cluster, you can do this simply:
import pandas as pd

# create pandas dataframe
df = pd.DataFrame({'col1': [1, 2, 3], 'col2': ['a', 'b', 'c']})
# convert to spark dataframe
df = spark.createDataFrame(df)
Local Spark Setup
import findspark
findspark.init()
import pyspark
spark = (pyspark
.sql
.SparkSession
.builder
.master("local")
.getOrCreate())
See my farsante lib for creating a DataFrame with fake data:
import farsante
df = farsante.quick_pyspark_df(['first_name', 'last_name'], 7)
df.show()
+----------+---------+
|first_name|last_name|
+----------+---------+
| Tommy| Hess|
| Arthur| Melendez|
| Clemente| Blair|
| Wesley| Conrad|
| Willis| Dunlap|
| Bruna| Sellers|
| Tonda| Schwartz|
+----------+---------+
Here's how to explicitly specify the schema when creating the PySpark DataFrame:
from pyspark.sql.types import StructType, StructField, IntegerType
df = spark.createDataFrame(
[(10,), (11,), (13,)],
StructType([StructField("some_int", IntegerType(), True)]))
df.show()
+--------+
|some_int|
+--------+
| 10|
| 11|
| 13|
+--------+
You can also try something like this -
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc) # sc is the spark context
sample = sqlContext.createDataFrame(
[
('qwe', 23), # enter your data here
('rty',34),
('yui',56),
],
['abc', 'def']  # the column names/labels go here
)
sample.show()
There are several ways to create a DataFrame; creating one is among the first steps you learn while working with PySpark.
I assume you already have data, columns, and an RDD (a minimal setup for these is sketched after the list).
1) df = rdd.toDF()
2) df = rdd.toDF(columns)  # assigns column names
3) df = spark.createDataFrame(rdd).toDF(*columns)
4) df = spark.createDataFrame(data).toDF(*columns)
5) df = spark.createDataFrame(rowData,columns)
Besides these, you can find several more examples of creating a PySpark DataFrame.
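A minimal, self-contained sketch of the setup these one-liners assume; the column names and rows are invented example data:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
columns = ["language", "users_count"]              # example column names (invented)
data = [("Java", 20000), ("Python", 100000)]       # example rows (invented)
rdd = spark.sparkContext.parallelize(data)

df = rdd.toDF(columns)                             # approach 2
df = spark.createDataFrame(rdd).toDF(*columns)     # approach 3
df = spark.createDataFrame(data, columns)          # approach 5
df.show()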
I am using PySpark 2.0.1. I'm trying to group my data frame and retrieve the values of all the other fields. I found that
z=data1.groupby('country').agg(F.collect_list('names'))
will give me values for the country and names attributes, and for the names attribute the column header will be collect_list(names). But for my job I have a dataframe with around 15 columns; I will run a loop and change the groupby field each time inside the loop, and I need the output for all of the remaining fields. Can you please suggest how to do this using collect_list() or any other PySpark functions?
I tried this code too
from pyspark.sql import functions as F
fieldnames=data1.schema.names
names1= list()
for item in fieldnames:
    if item != 'names':
        names1.append(item)
z=data1.groupby('names').agg(F.collect_list(names1))
z.show()
but got this error message:
Py4JError: An error occurred while calling z:org.apache.spark.sql.functions.collect_list. Trace: py4j.Py4JException: Method collect_list([class java.util.ArrayList]) does not exist
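For reference, the error occurs because collect_list takes a single column, not a list. A hedged sketch of building one expression per remaining column and unpacking them into agg (the group_field value is just an example):
from pyspark.sql import functions as F

# Sketch: one collect_list expression per non-grouped column, unpacked with *.
group_field = 'country'                            # changes on each loop iteration
exprs = [F.collect_list(c).alias(c) for c in data1.columns if c != group_field]
z = data1.groupby(group_field).agg(*exprs)
z.show()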
Use struct to combine the columns before calling groupBy.
Suppose you have a DataFrame:
import pyspark.sql.functions as f

df = spark.createDataFrame(sc.parallelize([(0,1,2),(0,4,5),(1,7,8),(1,8,7)])).toDF("a","b","c")
df = df.select("a", f.struct(["b","c"]).alias("newcol"))
df.show()
+---+------+
| a|newcol|
+---+------+
| 0| [1,2]|
| 0| [4,5]|
| 1| [7,8]|
| 1| [8,7]|
+---+------+
df = df.groupBy("a").agg(f.collect_list("newcol").alias("collected_col"))
df.show()
+---+--------------+
| a| collected_col|
+---+--------------+
| 0|[[1,2], [4,5]]|
| 1|[[7,8], [8,7]]|
+---+--------------+
An aggregation operation can be done only on single columns. After aggregating, you can collect the result and iterate over it to separate the combined columns, or you can write a udf to separate the combined columns.
from pyspark.sql.functions import udf, col
from pyspark.sql.types import StructType, StructField, ArrayType, LongType

def foo(x):
    x1 = [y[0] for y in x]
    x2 = [y[1] for y in x]
    return (x1, x2)

st = StructType([StructField("b", ArrayType(LongType())), StructField("c", ArrayType(LongType()))])
udf_foo = udf(foo, st)
df = df.withColumn("ncol", udf_foo("collected_col")).select(
    "a",
    col("ncol").getItem("b").alias("b"),
    col("ncol").getItem("c").alias("c"))
df.show()
+---+------+------+
| a| b| c|
+---+------+------+
| 0|[1, 4]|[2, 5]|
| 1|[7, 8]|[8, 7]|
+---+------+------+
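For completeness, a hedged sketch of the other option mentioned above (collecting the grouped result and splitting the pairs on the driver in plain Python); the example data is rebuilt so the snippet is self-contained:
import pyspark.sql.functions as f

# Sketch: collect the grouped structs to the driver and split b/c there.
grouped = (spark.createDataFrame([(0, 1, 2), (0, 4, 5), (1, 7, 8), (1, 8, 7)], ["a", "b", "c"])
           .select("a", f.struct("b", "c").alias("newcol"))
           .groupBy("a").agg(f.collect_list("newcol").alias("collected_col")))

for row in grouped.collect():
    b_vals = [pair["b"] for pair in row["collected_col"]]
    c_vals = [pair["c"] for pair in row["collected_col"]]
    print(row["a"], b_vals, c_vals)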
Actually, we can do it in PySpark 2.2.
First we need to create a constant column ("Temp"), group by that column ("Temp"), and apply agg with the unpacked iterable *exprs, where each expression is a collect_list.
Below is the code:
import pyspark.sql.functions as ftions
import functools as ftools

def groupColumnData(df, columns):
    df = df.withColumn("Temp", ftions.lit(1))
    exprs = [ftions.collect_list(colName) for colName in columns]
    df = df.groupby('Temp').agg(*exprs)
    df = df.drop("Temp")
    df = df.toDF(*columns)
    return df
Input Data:
df.show()
+---+---+---+
| a| b| c|
+---+---+---+
| 0| 1| 2|
| 0| 4| 5|
| 1| 7| 8|
| 1| 8| 7|
+---+---+---+
Output Data:
df.show()
+------------+------------+------------+
| a| b| c|
+------------+------------+------------+
|[0, 0, 1, 1]|[1, 4, 7, 8]|[2, 5, 8, 7]|
+------------+------------+------------+
In Spark 2.4.4 and Python 3.7 (I guess it is also relevant for previous Spark and Python versions), my suggestion is based on pauli's answer: instead of creating the struct and then using the agg function, create the struct inside collect_list:
from pyspark.sql.functions import collect_list, struct

df = spark.createDataFrame([(0,1,2),(0,4,5),(1,7,8),(1,8,7)]).toDF("a","b","c")
df.groupBy("a").agg(collect_list(struct(["b","c"])).alias("res")).show()
Result:
+---+-----------------+
| a|res |
+---+-----------------+
| 0|[[1, 2], [4, 5]] |
| 1|[[7, 8], [8, 7]] |
+---+-----------------+
I just use the concat_ws function and it works perfectly fine.
from pyspark.sql.functions import *

df = spark.createDataFrame([(0,1,2),(0,4,5),(1,7,8),(1,8,7)]).toDF("a","b","c")
df.groupBy('a').agg(collect_list(concat_ws(',','b','c')).alias('r')).show()