Intersection of two data frames with different columns in PySpark

I am new to data science and I am working on a simple self-directed project in Google Colab. I read data from a something1.csv file and a something2.csv file.
df1 = spark.read.csv('something1.csv', header=True)
df2 = spark.read.csv('something2.csv', header=True)
The data of something1.csv looks like
#+----------+---------+----------+--------+-------+
#| country| Latitude| Longitude| col1 | col2 |
#+----------+---------+----------+--------+-------+
#| Andorra| 42.506| 1.5218| 2 | 1 |
#| Australia| -31.81| 115.86| 1 | 6 |
#| Austria| 41.597| 12.580| 4 | 9 |
#| Belgium| 21.782| 1.286| 2 | 3 |
#| India| 78.389| 12.972| 1 | 7 |
#| Ireland| 9.281| 9.286| 9 | 8 |
#| USA| 69.371| 21.819| 7 | 2 |
#+----------+---------+----------+--------+-------+
The data of something2.csv looks like
#+----------+---------+----------+--------+-------+
#| country| Latitude| Longitude| col1 | col2 |
#+----------+---------+----------+--------+-------+
#| Australia| -31.81| 115.86| 2 | 6 |
#| Belgium| 21.782| 1.286| 1 | 6 |
#| India| 78.389| 12.972| 3 | 5 |
#| USA| 69.371| 21.819| 2 | 5 |
#+----------+---------+----------+--------+-------+
Now I want to intersect df2 with df1 based on Latitude and Longitude and get the rows of df1 whose coordinates also appear in df2, along with col1 and col2 from df1. My table should look like
#+----------+---------+----------+--------+-------+
#| country| Latitude| Longitude| col1 | col2 |
#+----------+---------+----------+--------+-------+
#| Australia| -31.81| 115.86| 1 | 6 |
#| Belgium| 21.782| 1.286| 2 | 3 |
#| India| 78.389| 12.972| 1 | 7 |
#| USA| 69.371| 21.819| 7 | 2 |
#+----------+---------+----------+--------+-------+
I tried using the following code, but it didn't work:
new_df = df1.intersect(df2) #using intersect in pyspark, which gave me an empty table
Then I also tried intersecting based only on Latitude and Longitude:
new_df = df2.select('Latitude','Longitude').intersect(df1.select('Latitude','Logitude')) #intersecting based on columns
Neither of the above methods worked in PySpark.

intersect only returns rows that are identical in both dataframes across every column; since col1 and col2 differ between df1 and df2, no complete rows match, which is why the result was empty.
In your case you need col1 and col2 from df1 and the remaining columns from df2, so join the dataframes (left or inner, as required) on the key columns and select col1, col2 from df1 and the other columns from df2.
(Or, as mentioned in the comments by Mohammad Murtaza Hashmi, use a left_semi join.)
Example:
#using left semi join
df1.join(df2,['Latitude','Longitude'],'left_semi').show()
#using left join
df2.alias("t2").join(df1.alias("t1"),['Latitude','Longitude'],'left').select("t2.country","t2.Latitude","t2.Longitude","t1.col1","t1.col2").show()
#+---------+--------+---------+----+----+
#| country|Latitude|Longitude|col1|col2|
#+---------+--------+---------+----+----+
#|Australia| -31.81| 115.86| 1| 6|
#| Belgium| 21.782| 1.286| 2| 3|
#| India| 78.389| 12.972| 1| 7|
#| USA| 69.371| 21.819| 7| 2|
#+---------+--------+---------+----+----+
Dynamic way:
#join columns (every column starting with "L", i.e. Latitude and Longitude)
join_cols = [x for x in df1.columns if x.startswith("L")]
#selecting cols from t1 (df1)
t1_cols = ["t1." + x for x in df1.columns if x.startswith("col")]
#selecting cols from t2 (df2)
t2_cols = ["t2." + x for x in df2.columns if not x.startswith("col")]
df2.alias("t2").join(df1.alias("t1"), join_cols, 'inner').select(*t2_cols + t1_cols).show()
#+---------+--------+---------+----+----+
#| country|Latitude|Longitude|col1|col2|
#+---------+--------+---------+----+----+
#|Australia| -31.81| 115.86| 1| 6|
#| Belgium| 21.782| 1.286| 2| 3|
#| India| 78.389| 12.972| 1| 7|
#| USA| 69.371| 21.819| 7| 2|
#+---------+--------+---------+----+----+
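For completeness, the asker's second attempt can also be made to work once the selected key columns match on both sides (the original snippet misspelled 'Longitude'): intersect the key columns only, then join the result back to df1 to recover col1 and col2. A minimal sketch:
keys = df2.select('Latitude','Longitude').intersect(df1.select('Latitude','Longitude'))
keys.join(df1, ['Latitude','Longitude'], 'inner').show()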

Related

PySpark: GroupBy multiple columns and calculate group number

I have a dataframe like:
id Name Rank Course
1 S1 21 Physics
2 S2 22 Chemistry
3 S3 24 Math
4 S2 22 English
5 S2 22 Social
6 S1 21 Geography
I want to group this dataset over Name and Rank and calculate the group number. In pandas, I can easily do:
df['ngrp'] = df.groupby(['Name', 'Rank']).ngroup()
After computing the above, I get the following output:
id Name Rank Course ngrp
1 S1 21 Physics 0
6 S1 21 Geography 0
2 S2 22 Chemistry 1
4 S2 22 English 1
5 S2 22 Social 1
3 S3 24 Math 2
Is there a method in Pyspark that will achieve the same output? I tried the following, but it doesn't seem to work:
from pyspark.sql import Window
w = Window.partitionBy(['Name', 'Rank'])
df.select(['Name', 'Rank'], ['Course'], f.count(['Name', 'Rank']).over(w).alias('ngroup')).show()
You can opt for DENSE_RANK -
Data Preparation
# `sql` is assumed to be the SparkSession
from io import StringIO
import pandas as pd
from pyspark.sql import Window
from pyspark.sql import functions as F

df = pd.read_csv(StringIO("""
id,Name,Rank,Course
1,S1,21,Physics
2,S2,22,Chemistry
3,S3,24,Math
4,S2,22,English
5,S2,22,Social
6,S1,21,Geography
"""), delimiter=',')

sparkDF = sql.createDataFrame(df)
sparkDF.show()
+---+----+----+---------+
| id|Name|Rank| Course|
+---+----+----+---------+
| 1| S1| 21| Physics|
| 2| S2| 22|Chemistry|
| 3| S3| 24| Math|
| 4| S2| 22| English|
| 5| S2| 22| Social|
| 6| S1| 21|Geography|
+---+----+----+---------+
Dense Rank
window = Window.orderBy(['Name','Rank'])
sparkDF = sparkDF.withColumn('ngroup',F.dense_rank().over(window) - 1)
sparkDF.orderBy(['Name','ngroup']).show()
+---+----+----+---------+------+
| id|Name|Rank| Course|ngroup|
+---+----+----+---------+------+
| 6| S1| 21|Geography| 0|
| 1| S1| 21| Physics| 0|
| 4| S2| 22| English| 1|
| 2| S2| 22|Chemistry| 1|
| 5| S2| 22| Social| 1|
| 3| S3| 24| Math| 2|
+---+----+----+---------+------+
Dense Rank - SparkSQL
# register the DataFrame as a temp view so it can be queried as TB1
sparkDF.createOrReplaceTempView("TB1")

sql.sql("""
SELECT
    ID,
    NAME,
    RANK,
    COURSE,
    DENSE_RANK() OVER (ORDER BY NAME, RANK) - 1 AS NGROUP
FROM TB1
""").show()
+---+----+----+---------+------+
| ID|NAME|RANK| COURSE|NGROUP|
+---+----+----+---------+------+
| 1| S1| 21| Physics| 0|
| 6| S1| 21|Geography| 0|
| 2| S2| 22|Chemistry| 1|
| 4| S2| 22| English| 1|
| 5| S2| 22| Social| 1|
| 3| S3| 24| Math| 2|
+---+----+----+---------+------+
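Note that Window.orderBy without partitionBy moves every row into a single partition, and Spark logs a warning about the possible performance hit. For larger data, one possible variant, assuming only the per-(Name, Rank) group id is needed, is to rank just the distinct keys and join the id back:
# rank only the distinct (Name, Rank) keys, then join the group id back
base = sparkDF.drop('ngroup')  # the frame before the window column was added
key_window = Window.orderBy('Name', 'Rank')
groups = (base.select('Name', 'Rank')
              .distinct()
              .withColumn('ngroup', F.dense_rank().over(key_window) - 1))
base.join(groups, on=['Name', 'Rank'], how='left').orderBy('ngroup').show()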

Conditional calculation with two datasets - PySpark

Imagine you have two datasets df and df2 like the following:
df:
ID Size Condition
1 2 1
2 3 0
3 5 0
4 7 1
df2:
aux_ID Scalar
1 2
3 2
I want to get an output where, if Condition in df is 1, Size is multiplied by the Scalar from df2, and df is returned with the changed values.
I would like to do this as efficiently as possible, perhaps avoiding the join if that's possible.
output_df:
ID Size Condition
1 4 1
2 3 0
3 5 0
4 7 1
Not sure why you would want to avoid joins in the first place; they can be efficient in their own right.
That said, this can easily be done by merging the two datasets and building a case-when expression on the condition.
Data Preparation
# `sql` is assumed to be the SparkSession
from io import StringIO
import pandas as pd
from pyspark.sql import functions as F

df1 = pd.read_csv(StringIO("""ID,Size,Condition
1,2,1
2,3,0
3,5,0
4,7,1
"""), delimiter=',')

df2 = pd.read_csv(StringIO("""aux_ID,Scalar
1,2
3,2
"""), delimiter=',')

sparkDF1 = sql.createDataFrame(df1)
sparkDF2 = sql.createDataFrame(df2)
sparkDF1.show()
+---+----+---------+
| ID|Size|Condition|
+---+----+---------+
| 1| 2| 1|
| 2| 3| 0|
| 3| 5| 0|
| 4| 7| 1|
+---+----+---------+
sparkDF2.show()
+------+------+
|aux_ID|Scalar|
+------+------+
| 1| 2|
| 3| 2|
+------+------+
Case When
finalDF = (sparkDF1.join(sparkDF2,
                         sparkDF1['ID'] == sparkDF2['aux_ID'],
                         'left')
                   .select(sparkDF1['*'],
                           sparkDF2['Scalar'],
                           sparkDF2['aux_ID'])
                   .withColumn('Size_Changed',
                               F.when((F.col('Condition') == 1)
                                      & (F.col('aux_ID').isNotNull()),
                                      F.col('Size') * F.col('Scalar'))
                                .otherwise(F.col('Size'))))
finalDF.show()
+---+----+---------+------+------+------------+
| ID|Size|Condition|Scalar|aux_ID|Size_Changed|
+---+----+---------+------+------+------------+
| 1| 2| 1| 2| 1| 4|
| 3| 5| 0| 2| 3| 5|
| 2| 3| 0| null| null| 3|
| 4| 7| 1| null| null| 7|
+---+----+---------+------+------+------------+
You can drop the unnecessary columns; I kept them here for illustration.
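If the concern is the cost of the join: with a df2 this small, a broadcast hint keeps the join shuffle-free (Spark usually does this automatically under spark.sql.autoBroadcastJoinThreshold). A minimal sketch, assuming sparkDF2 fits comfortably in memory:
finalDF = (sparkDF1.join(F.broadcast(sparkDF2),
                         sparkDF1['ID'] == sparkDF2['aux_ID'],
                         'left')
                   .withColumn('Size',
                               F.when((F.col('Condition') == 1)
                                      & F.col('aux_ID').isNotNull(),
                                      F.col('Size') * F.col('Scalar'))
                                .otherwise(F.col('Size')))
                   .drop('aux_ID', 'Scalar'))
finalDF.show()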

How to group by only specific features using PySpark

I have this data frame
+---------+------+-----+-------------+-----+
| LCLid|KWH/hh|Acorn|Acorn_grouped|Month|
+---------+------+-----+-------------+-----+
|MAC000002| 0.0| 0| 0| 10|
|MAC000002| 0.0| 0| 0| 10|
|MAC000002| 0.0| 0| 0| 10|
I want to group by LCLid and Month and aggregate only the KWH/hh consumption, so that I get
+---------+-----+------------------+----------+------------------+
| LCLid|Month| sum(KWH/hh)|Acorn |Acorn_grouped |
+---------+-----+------------------+----------+------------------+
|MAC000003| 10| 904.9270009999999| 0 | 0 |
|MAC000022| 2|1672.5559999999978| 1 | 0 |
|MAC000019| 4| 368.4720001000007| 1 | 1 |
|MAC000022| 9|449.07699989999975| 0 | 1 |
|MAC000024| 8| 481.7160003000004| 1 | 0 |
but all I could manage was this code
dataset=dataset.groupBy("LCLid","Month").sum()
which gave me this result
+---------+-----+------------------+----------+------------------+----------+
| LCLid|Month| sum(KWH/hh)|sum(Acorn)|sum(Acorn_grouped)|sum(Month)|
+---------+-----+------------------+----------+------------------+----------+
|MAC000003| 10| 904.9270009999999| 2978| 2978| 29780|
|MAC000022| 2|1672.5559999999978| 12090| 4030| 8060|
|MAC000019| 4| 368.4720001000007| 20174| 2882| 11528|
|MAC000022| 9|449.07699989999975| 8646| 2882| 25938|
The problem is that the sum was also calculated over the Acorn and Acorn_grouped columns.
Do you have any idea how I could restrict the aggregation to KWH/hh only?
It depends on how you want to handle the other two columns. If you don't want to sum them and just want any value from each column, you can do:
import pyspark.sql.functions as F

dataset = dataset.groupBy("LCLid", "Month").agg(
    F.sum("KWH/hh"),
    F.first("Acorn").alias("Acorn"),
    F.first("Acorn_grouped").alias("Acorn_grouped")
)
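Alternatively, assuming Acorn and Acorn_grouped are constant within each LCLid (the sample suggests this, but it is an assumption about your data), they can simply be made part of the grouping key:
dataset.groupBy("LCLid", "Month", "Acorn", "Acorn_grouped") \
       .agg(F.sum("KWH/hh").alias("sum(KWH/hh)")) \
       .show()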

Adding column with rolling latest prior in PySpark

I have a pyspark dataframe with a list of customers, days, and transaction types.
+----------+-----+------+
| Customer | Day | Type |
+----------+-----+------+
| A | 2 | X11 |
| A | 4 | X2 |
| A | 9 | Y4 |
| A | 11 | X1 |
| B | 3 | Y4 |
| B | 7 | X1 |
+----------+-----+------+
I'd like to create a column that has "most recent X type" for each customer, like so:
+----------+-----+------+-------------+
| Customer | Day | Type | MostRecentX |
+----------+-----+------+-------------+
| A | 2 | X11 | X11 |
| A | 4 | X2 | X2 |
| A | 9 | Y4 | X2 |
| A | 11 | X1 | X1 |
| B | 3 | Y4 | - |
| B | 7 | X1 | X1 |
+----------+-----+------+-------------+
So for the X types it just takes the Type from the current row, but for a Y type it takes the Type from that customer's most recent X row (and if there isn't one, it gets a blank or a placeholder). I imagine I need some sort of window function, but I'm not very familiar with PySpark.
You can achieve this by taking the last value of Type that starts with the letter "X" over a Window that partitions by Customer and orders by Day. Specify the Window to start at the beginning of the partition and stop at the current row.
from pyspark.sql import Window
from pyspark.sql.functions import col, last, when
w = Window.partitionBy("Customer").orderBy("Day").rowsBetween(Window.unboundedPreceding, 0)
df = df.withColumn(
    "MostRecentX",
    last(when(col("Type").startswith("X"), col("Type")), ignorenulls=True).over(w)
)
df.show()
#+--------+---+----+-----------+
#|Customer|Day|Type|MostRecentX|
#+--------+---+----+-----------+
#| A| 2| X11| X11|
#| A| 4| X2| X2|
#| A| 9| Y4| X2|
#| A| 11| X1| X1|
#| B| 3| Y4| null|
#| B| 7| X1| X1|
#+--------+---+----+-----------+
The trick here is to use when to return the Type column only if it starts with "X". By default, when will return null. Then we can use last with ignorenulls=True to get the value for MostRecentX.
If you want to replace the null with "-" as shown in your question, just call fillna on the MostRecentX column:
df.fillna("-", subset=["MostRecentX"]).show()
#+--------+---+----+-----------+
#|Customer|Day|Type|MostRecentX|
#+--------+---+----+-----------+
#| A| 2| X11| X11|
#| A| 4| X2| X2|
#| A| 9| Y4| X2|
#| A| 11| X1| X1|
#| B| 3| Y4| -|
#| B| 7| X1| X1|
#+--------+---+----+-----------+
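Equivalently, the placeholder can be supplied inline with coalesce instead of a separate fillna pass; a small sketch, assuming "-" is the placeholder you want:
from pyspark.sql.functions import coalesce, lit

df = df.withColumn(
    "MostRecentX",
    coalesce(last(when(col("Type").startswith("X"), col("Type")), ignorenulls=True).over(w),
             lit("-"))
)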

Transpose multiple columns in a PySpark dataframe with a condition

I have a spark dataframe that looks like this.
id  cd1  version1  dt1       cd2  version2  dt2       cd3  version3  dt3
1   100  1         20100101  101  1         20100101  102            20100301
1   101  1         20100102  102            20100201  100  1         20100302
2   201  1         20100103  100  1         20100301  100  1         20100303
2   202  2         20100104  100  1         20100105
I need to transpose all the codes into a single column with the following conditions
If the corresponding version code is 1, add a decimal point after the first digit
Each patient should have distinct codes
For the above example, the output should look like this.
id code dt
1 1.00 20100101
1 1.01 20100101
1 102 20100301
1 1.01 20100102
1 102 20100201
1 10.0 20100302
2 2.01 20100103
2 1.00 20100301
2 1.00 20100303
2 202 20100104
2 10.0 20100105
I am using PySpark to do this. In the above example I have shown only 3 codes with their corresponding version and date columns, but I have 30 such column groups. Also, this data has around 25 million rows.
Any ideas on how to accomplish this will be extremely helpful.
You can explode a list of these columns so that there is only one (cd, version) pair per row.
First, let's create the dataframe:
df = sc.parallelize([[1,100,1,101,1,102,None],[1,101,1,102,None,100,1],[2,201,1,100,1,100,1],
[2,202,2,100,1,None,None]]).toDF(["id","cd1","version1","cd2","version2","cd3","version3"])
Using posexplode:
import pyspark.sql.functions as psf
from itertools import chain

nb_versions = 4
df = df.na.fill(-1).select(
    "id",
    psf.posexplode(psf.create_map(list(chain(*
        [(psf.col("cd" + str(i)), psf.col("version" + str(i))) for i in range(1, nb_versions)]
    )))).alias("pos", "cd", "version")
).drop("pos").filter("cd != -1")
+---+---+-------+
| id| cd|version|
+---+---+-------+
| 1|100| 1|
| 1|101| 1|
| 1|102| -1|
| 1|101| 1|
| 1|102| -1|
| 1|100| 1|
| 2|201| 1|
| 2|100| 1|
| 2|100| 1|
| 2|202| 2|
| 2|100| 1|
+---+---+-------+
Using explode (starting again from the original df, before the posexplode step):
nb_versions = 4
df = df.select(
    "id",
    psf.explode(psf.array([
        psf.struct(
            psf.col("cd" + str(i)).alias("cd"),
            psf.col("version" + str(i)).alias("version")
        ) for i in range(1, nb_versions)
    ])).alias("temp")
).select("id", "temp.*")
+---+----+-------+
| id| cd|version|
+---+----+-------+
| 1| 100| 1|
| 1| 101| 1|
| 1| 102| null|
| 1| 101| 1|
| 1| 102| null|
| 1| 100| 1|
| 2| 201| 1|
| 2| 100| 1|
| 2| 100| 1|
| 2| 202| 2|
| 2| 100| 1|
| 2|null| null|
+---+----+-------+
Now we can implement your conditions:
division by 100 when version == 1
distinct values
We'll use the functions when and otherwise for the condition, and distinct:
df.withColumn("cd", psf.when(df.version == 1, df.cd/100).otherwise(df.cd))\
.distinct().drop("version")
+---+-----+
| id| cd|
+---+-----+
| 1| 1.0|
| 1| 1.01|
| 1|102.0|
| 2| 1.0|
| 2| 2.01|
| 2|202.0|
+---+-----+
This is how I did it. I am sure there is a better way to do this.
from pyspark.sql.functions import when, concat, substring, lit

def process_code(raw_data):
    # when version == 1, insert a decimal point after the first digit of the code
    for i in range(1, 4):  # use range(1, 31) for the full 30 column groups
        cd_col_name = "cd" + str(i)
        version_col_name = "version" + str(i)
        raw_data = raw_data.withColumn("mod_cd" + str(i),
            when(raw_data[version_col_name] == 1,
                 concat(substring(raw_data[cd_col_name], 1, 1), lit("."),
                        substring(raw_data[cd_col_name], 2, 20))
            ).otherwise(raw_data[cd_col_name]))
    mod_cols = [col for col in raw_data.columns if 'mod_cd' in col]
    nb_versions = 4  # covers mod_cd1..mod_cd3 / dt1..dt3 below
    # fillna('9999') avoids null map keys (create_map rejects them); raw_data must contain the dt1..dtN columns (present in the real data, not in the toy df above)
    new = raw_data.fillna('9999', subset=mod_cols).select("id",
        psf.posexplode(psf.create_map(list(chain(*
            [(psf.col("mod_cd" + str(i)), psf.col("dt" + str(i))) for i in range(1, nb_versions)]
        )))).alias("pos", "final_cd", "final_date")).drop("pos")
    return new

test = process_code(df)
test = test.filter(test.final_cd != '9999')
test.show(100, False)
