I have two files, A and B, which are exactly the same. I am trying to perform inner and outer joins on these two dataframes. Since all of the columns are duplicates, the existing answers were of no help.
The other questions I have gone through have only a column or two duplicated; my issue is that the whole files are duplicates of each other, both in data and in column names.
My code:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import DataFrameReader, DataFrameWriter
from datetime import datetime
import time
# #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
print("All imports were successful.")
df = spark.read.orc('s3://****')
print("First dataframe read with headers set to True")
df2 = spark.read.orc('s3://****')
print("Second dataframe read with headers set to True")
# df3 = df.join(df2, ['c_0'], "outer")
# df3 = df.join(
# df2,
# df["column_test_1"] == df2["column_1"],
# "outer"
# )
df3 = df.alias('l').join(df2.alias('r'), on='c_0') #.collect()
print("Dataframes have been joined successfully.")
output_file_path = 's3://****'
df3.write.orc(output_file_path)
print("Dataframe has been written to orc.")
job.commit()
The error that I am facing is:
pyspark.sql.utils.AnalysisException: u'Duplicate column(s): "c_4", "c_38", "c_13", "c_27", "c_50", "c_16", "c_23", "c_24", "c_1", "c_35", "c_30", "c_56", "c_34", "c_7", "c_46", "c_49", "c_57", "c_45", "c_31", "c_53", "c_19", "c_25", "c_10", "c_8", "c_14", "c_42", "c_20", "c_47", "c_36", "c_29", "c_15", "c_43", "c_32", "c_5", "c_37", "c_18", "c_54", "c_3", "__created_at__", "c_51", "c_48", "c_9", "c_21", "c_26", "c_44", "c_55", "c_2", "c_17", "c_40", "c_28", "c_33", "c_41", "c_22", "c_11", "c_12", "c_52", "c_6", "c_39" found, cannot save to file.;'
There is no shortcut here. PySpark expects the left and right dataframes to have distinct sets of field names (with the exception of the join key).
One solution would be to prefix each field name with either "left_" or "right_" as follows:
# Obtain column lists
left_cols = df.columns
right_cols = df2.columns

# Prefix every field except the join key with "left_" or "right_"
df = df.selectExpr(['c_0'] + [col + ' as left_' + col for col in left_cols if col != 'c_0'])
df2 = df2.selectExpr(['c_0'] + [col + ' as right_' + col for col in right_cols if col != 'c_0'])

# Perform the join on the shared key
df3 = df.join(df2, on='c_0')
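The question also asks about outer joins; under the same prefixed-column setup, the join type is just an explicit how argument (a minimal sketch, not tested against the original data):
# Full outer join on the shared key; all other columns are already
# disambiguated by their left_/right_ prefixes, so the ORC write no longer fails.
df3_outer = df.join(df2, on='c_0', how='outer')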
Here is a helper function that joins two dataframes, renaming the overlapping columns on the right:
def join_with_aliases(left, right, on, how, right_prefix):
    # Rename every right-hand column that is not a join key by appending right_prefix
    renamed_right = right.selectExpr(
        [f"{col} as {col}{right_prefix}" for col in right.columns if col not in on]
        + on
    )
    return left.join(renamed_right, on=on, how=how)
and here is an example of how to use it:
df1 = spark.createDataFrame([[1, "a"], [2, "b"], [3, "c"]], ("id", "value"))
df2 = spark.createDataFrame([[1, "a"], [2, "b"], [3, "c"]], ("id", "value"))
join_with_aliases(
    left=df1,
    right=df2,
    on=["id"],
    how="inner",
    right_prefix="_right"
).show()
+---+-----+-----------+
| id|value|value_right|
+---+-----+-----------+
|  1|    a|          a|
|  3|    c|          c|
|  2|    b|          b|
+---+-----+-----------+
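For the outer join from the original question, the same helper can be reused by changing the how argument (same sample dataframes as above):
join_with_aliases(
    left=df1,
    right=df2,
    on=["id"],
    how="outer",
    right_prefix="_right"
).show()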
I did something like this, but in Scala; you can convert the same into PySpark as well (a rough PySpark sketch follows the join below).
Rename the columns in each dataframe:
dataFrame1.columns.foreach(columnName => {
  dataFrame1 = dataFrame1.withColumnRenamed(columnName, s"left_$columnName")
})
dataFrame2.columns.foreach(columnName => {
  dataFrame2 = dataFrame2.withColumnRenamed(columnName, s"right_$columnName")
})
Now join by mentioning the column names
val resultDF = dataFrame1.join(dataFrame2, dataFrame1("left_c_0") === dataFrame2("right_c_0"))
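For reference, a rough PySpark equivalent of the same rename-then-join idea (a sketch, assuming the dataframes are named df and df2 and the join key is c_0):
from pyspark.sql import functions as F

# Prefix every column on each side, then join on the prefixed keys.
left = df.select([F.col(c).alias("left_" + c) for c in df.columns])
right = df2.select([F.col(c).alias("right_" + c) for c in df2.columns])
result = left.join(right, left["left_c_0"] == right["right_c_0"])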
I have two databases in Snowflake, DB1 & DB2. The data is migrated from DB1 to DB2, so the schema and the table names are the same.
Assume DB1.SCHEMA_1.TABLE_1 has this data:
STATE_ID STATE
1 AL
2 AN
3 AZ
4 AR
5 CA
6 AD
7 PN
8 AP
9 JH
10 TX
12 LA
and
Assume DB2.SCHEMA_1.TABLE_1 has this data:
STATE_ID STATE
1 AL
2 AK
3 AZ
4 AR
5 AC
6 AD
7 GP
8 AP
9 JH
10 HA
They both have one more column 'record_created_timestamp' but I drop it in the code.
I wrote a PySpark script that performs a column-based comparison, to run in an AWS Glue job. I got help from this link: Generate a report of mismatch Columns between 2 Pyspark dataframes
My code is:
import sys
from pyspark.sql.session import SparkSession
from pyspark.sql import Row
from pyspark.sql.functions import concat, col, lit, to_timestamp, when
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from py4j.java_gateway import java_import
import os
from pyspark.sql.types import *
from pyspark.sql.functions import substring
from pyspark.sql.functions import array, count, first
import json
import datetime
import time
import boto3
from botocore.exceptions import ClientError
now = datetime.datetime.now()
year = now.strftime("%Y")
month = now.strftime("%m")
day = now.strftime("%d")
glueClient = boto3.client('glue')
ssmClient = boto3.client('ssm')
region = os.environ['AWS_DEFAULT_REGION']
SNOWFLAKE_SOURCE_NAME = "net.snowflake.spark.snowflake"
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'CONNECTION_INFO', 'TABLE_NAME', 'BUCKET_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
client = boto3.client("secretsmanager", region_name="us-east-1")
get_secret_value_response = client.get_secret_value(
SecretId=args['CONNECTION_INFO']
)
secret = get_secret_value_response['SecretString']
secret = json.loads(secret)
db_username = secret.get('db_username')
db_password = secret.get('db_password')
db_warehouse = secret.get('db_warehouse')
db_url = secret.get('db_url')
db_account = secret.get('db_account')
db_name = secret.get('db_name')
db_schema = secret.get('db_schema')
logger = glueContext.get_logger()
logger.info('Fetching configuration.')
job.init(args['JOB_NAME'], args)
java_import(spark._jvm, SNOWFLAKE_SOURCE_NAME)
spark._jvm.net.snowflake.spark.snowflake.SnowflakeConnectorUtils.enablePushdownSession(spark._jvm.org.apache.spark.sql.SparkSession.builder().getOrCreate())
sfOptions = {
    "sfURL": db_url,
    "sfAccount": db_account,
    "sfUser": db_username,
    "sfPassword": db_password,
    "sfSchema": db_schema,
    "sfDatabase": db_name,
    "sfWarehouse": db_warehouse
}
print(f'database: {db_name}')
print(f'db_warehouse: {db_warehouse}')
print(f'db_schema: {db_schema}')
print(f'db_account: {db_account}')
table_name = args['TABLE_NAME']
bucket_name = args['BUCKET_NAME']
MySql_1 = f"""
select * from DB1.SCHEMA_1.TABLE_1
"""
df = spark.read.format("snowflake").options(**sfOptions).option("query", MySql_1).load()
df1 = df.drop('record_created_timestamp')
MySql_2 = f"""
select * from DB2.SCHEMA_1.TABLE_1
"""
df2 = spark.read.format("snowflake").options(**sfOptions).option("query", MySql_2).load()
df3 = df2.drop('record_created_timestamp')
# list of columns to be compared
cols = df1.columns[1:]
df_new = (df1.join(df3, "state_id", "outer")
.select([ when(~df1[c].eqNullSafe(df3[c]), array(df1[c], df3[c])).alias(c) for c in cols ])
.selectExpr('stack({},{}) as (Column_Name, mismatch)'.format(len(cols), ','.join('"{0}",`{0}`'.format(c) for c in cols)))
.filter('mismatch is not NULL'))
df_newv1 = df_new.selectExpr('Column_Name', 'mismatch[0] as Mismatch_In_DB1_Table', 'mismatch[1] as Mismatch_In_DB2_Table')
df_newv1.show()
SNOWFLAKE_SOURCE_NAME = "snowflake"
job.commit()
This provides me the correct output:
Column_Name  Mismatch_In_DB1_Table  Mismatch_In_DB2_Table
STATE        AN                     AK
STATE        CA                     AC
STATE        PN                     GP
STATE        TX                     HA
If I use STATE instead of STATE_ID as the outer join key,
df_new = (df1.join(df2, "state", "outer")
it shows this error:
AnalysisException: 'Resolved attribute(s) STATE#1,STATE#9 missing from STATE#14,STATE_ID#0,STATE_ID#8 in operator !Project [CASE WHEN NOT (STATE#1 <=> STATE#9) THEN array(STATE#1, STATE#9) END AS STATE#18]. Attribute(s) with the same name appear in the operation: STATE,STATE. Please check if the right attribute(s) are used.;;\n!Project [CASE WHEN NOT (STATE#1 <=> STATE#9) THEN array(STATE#1, STATE#9) END AS STATE#18]\n+- Project [coalesce(STATE#1, STATE#9) AS STATE#14, STATE_ID#0, STATE_ID#8]\n +- Join FullOuter, (STATE#1 = STATE#9)\n :- Project [STATE_ID#0, STATE#1]\n : +- Relation[STATE_ID#0,STATE#1,RECORD_CREATED_TIMESTAMP#2] SnowflakeRelation\n +- Relation[STATE_ID#8,STATE#9] SnowflakeRelation\n
I would appreciate an explanation of this, and I want to know whether there is a way this could run even if I give STATE as the key.
Alternatively, if there is other code that produces the same output without this error, that would help too.
It seems that Spark is getting confused by the column names of the two dataframes. Try giving them aliases to make sure the references resolve:
df1 = df.drop('record_created_timestamp')\
.select(df.STATE_ID.alias('state_id'), df.STATE.alias('state'))
df3 = df2.drop('record_created_timestamp')\
    .select(df2.STATE_ID.alias('state_id'), df2.STATE.alias('state'))
Also, make sure STATE_ID has no spaces or special characters in its column name.
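If the error persists, another option (a sketch, not the poster's code, assuming the same df1/df3 pair) is to give the right-hand side distinct column names before the join, so every reference stays unambiguous after the outer join on state:
from pyspark.sql.functions import col, when, array

# Rename the right-hand columns so df1's columns and the renamed ones never collide.
right = df3.select([col(c).alias(c + "_r") for c in df3.columns])
joined = df1.join(right, df1["state"] == right["state_r"], "outer")

# Compare every non-key column against its renamed counterpart.
cols = [c for c in df1.columns if c.lower() != "state"]
mismatches = joined.select(
    [when(~col(c).eqNullSafe(col(c + "_r")), array(col(c), col(c + "_r"))).alias(c)
     for c in cols]
)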
I want to create a Hive table, and the data file is as follows.
Data file:
<__name__>abc
<__code__>1
<__value__>1234
<__name__>abcdef
<__code__>2
<__value__>12345
<__name__>abcdef
<__code__>2
<__value__>12345
1234156321
<__name__>abcdef
<__code__>2
<__value__>12345
...
Can I create a table right away without converting the file?
It's a plain text file in which the three columns repeat.
If not, how can I convert it to a dataframe or a CSV file?
I want
| name | code | value
| abc | 1 | 1234
| abcdef | 2 | 12345
...
or
abc,1,1234
abcdef,2,12345
...
I solved my problem like this.
from pyspark.sql import Row
from pyspark.sql.types import StructType, StringType, IntegerType
from pyspark.sql.functions import split, regexp_replace, first

data = spark.read.text(path)
rows = data.rdd.zipWithIndex().map(lambda x: Row(x[0].value, int(x[1]/3)))
schema = StructType() \
    .add("col1", StringType(), False) \
    .add("record_pos", IntegerType(), False)
df = spark.createDataFrame(rows, schema)
df1 = df.withColumn("key", regexp_replace(split(df["col1"], '__>')[0], '<|__', '')) \
    .withColumn("value", regexp_replace(regexp_replace(split(df["col1"], '__>')[1], '\n', '<NL>'), '\t', '<TAB>'))
dataframe = df1.groupBy("record_pos").pivot("key").agg(first("value")).drop("record_pos")
dataframe.show()
val path = "file:///C:/stackqustions/data/stackq5.csv"
val data = sc.textFile(path)
import spark.implicits._
val rdd = data.zipWithIndex.map {
case (records, index) => Row(records, index / 3)
}
val schema = new StructType().add("col1", StringType, false).add("record_pos", LongType, false)
val df = spark.createDataFrame(rdd, schema)
val df1 = df
.withColumn("key", regexp_replace(split($"col1", ">")(0), "<|__", ""))
.withColumn("value", split($"col1", ">")(1)).drop("col1")
df1.groupBy("record_pos").pivot("key").agg(first($"value")).drop("record_pos").show
result:
+----+------+-----+
|code| name|value|
+----+------+-----+
| 1| abc| 1234|
| 2|abcdef|12345|
| 2|abcdef|12345|
| 2|abcdef|12345|
+----+------+-----+
I need to extract from utc_timestamp its date and its hour into two different columns, depending on the time zone. The time zone name is determined by the id column via a configuration constant.
Input DF:
+-------------+--+
|utc_timestamp|id|
+-------------+--+
|1608000000782|1 |
|1608000240782|2 |
+-------------+--+

Output DF:
+-------------+--+----------+----+
|utc_timestamp|id|date      |hour|
+-------------+--+----------+----+
|1608000000782|1 |2020-12-14|20  |
|1608000240782|2 |2020-12-15|11  |
+-------------+--+----------+----+
I have pandas_udfs that allow me to extract one column at a time, and I have to create the provider twice:
from pyspark.sql import DataFrame
from pyspark.sql import functions as f
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import DateType, IntegerType
import pandas as pd
import pytz

TIMEZONE_LIST = {1: 'America/Chicago', 2: 'Asia/Tokyo'}

class TimezoneUdfProvider(object):
    def __init__(self):
        self.extract_date_udf = pandas_udf(self._extract_date, DateType(), PandasUDFType.SCALAR)
        self.extract_hour_udf = pandas_udf(self._extract_hour, IntegerType(), PandasUDFType.SCALAR)

    def _extract_date(self, utc_timestamps: pd.Series, ids: pd.Series) -> pd.Series:
        return pd.Series([extract_date(c1, c2) for c1, c2 in zip(utc_timestamps, ids)])

    def _extract_hour(self, utc_timestamps: pd.Series, ids: pd.Series) -> pd.Series:
        return pd.Series([extract_hour(c1, c2) for c1, c2 in zip(utc_timestamps, ids)])

def extract_date(utc_timestamp: int, id: str):
    timezone_name = TIMEZONE_LIST[id]
    timezone_nw = pytz.timezone(timezone_name)
    return pd.datetime.fromtimestamp(utc_timestamp / 1000e00, tz=timezone_nw).date()

def extract_hour(utc_timestamp: int, id: str) -> int:
    timezone_name = TIMEZONE_LIST[id]
    timezone_nw = pytz.timezone(timezone_name)
    return pd.datetime.fromtimestamp(utc_timestamp / 1000e00, tz=timezone_nw).hour

def extract_from_utc(df: DataFrame) -> DataFrame:
    timezone_udf1 = TimezoneUdfProvider()
    df_with_date = df.withColumn('date', timezone_udf1.extract_date_udf(f.col('utc_timestamp'), f.col('id')))
    timezone_udf2 = TimezoneUdfProvider()
    df_with_hour = df_with_date.withColumn('hour', timezone_udf2.extract_hour_udf(f.col('utc_timestamp'), f.col('id')))
    return df_with_hour
Is there a better way to do it? Without a need to use the same udf provider twice?
You can do this without a udf, using Spark's built-in functions.
We can use create_map to map the dictionary and create a new timezone column, then convert with from_unixtime and from_utc_timestamp, passing the newly mapped column as the timezone. Once we have the timestamp in the correct time zone, we can fetch the hour and date fields.
TIMEZONE_LIST = {1: 'America/Chicago', 2: 'Asia/Tokyo'}
import pyspark.sql.functions as F
from itertools import chain

map_exp = F.create_map([F.lit(i) for i in chain(*TIMEZONE_LIST.items())])
final = (df.withColumn("TimeZone", map_exp.getItem(F.col("id")))
           .withColumn("Timestamp",
                       F.from_utc_timestamp(F.from_unixtime(F.col("utc_timestamp")/1000), F.col("TimeZone")))
           .withColumn("date", F.to_date("Timestamp"))
           .withColumn("Hour", F.hour("Timestamp"))
           .drop("Timestamp"))
final.show()
+-------------+---+---------------+----------+----+
|utc_timestamp| id| TimeZone| date|Hour|
+-------------+---+---------------+----------+----+
|1608000000782| 1|America/Chicago|2020-12-14| 20|
|1608000240782| 2| Asia/Tokyo|2020-12-15| 11|
+-------------+---+---------------+----------+----+
EDIT: replacing create_map with a udf:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

TIMEZONE_LIST = {1: 'America/Chicago', 2: 'Asia/Tokyo'}

def fun(x):
    return TIMEZONE_LIST.get(x, None)

map_udf = F.udf(fun, StringType())
final = (df.withColumn("TimeZone", map_udf("id"))
           .withColumn("Timestamp",
                       F.from_utc_timestamp(F.from_unixtime(F.col("utc_timestamp")/1000), F.col("TimeZone")))
           .withColumn("date", F.to_date("Timestamp"))
           .withColumn("Hour", F.hour("Timestamp"))
           .drop("Timestamp"))
final.show()
So basically I'm learning PySpark, and I know how to split a full name into first name and last name in Python:
name = "sun moon"
FName = name.split()[0]
LName = name.split()[1]
I want to do this in PySpark with a JSON file:
{"l":"santee, california, united states","t":"161xxxx","caseN":"888548748565","caseL":"CA","n":"sun moon"}
My code:
df = spark.read.json("cases.json")
df.select("l","t","caseN","caseL","n")
df \
.write \
.mode('overwrite') \
.option('header', 'true') \
.csv('cases')
I want to split n into FName and LName.
from pyspark.sql.functions import split, col
df = spark.read.json("cases.json")
df.select("l","t","caseN","caseL","n")\
.withColumn("FName", split(col("n"), " ").getItem(0))\
.withColumn("LName", split(col("n"), " ").getItem(1))\
.write \
.mode('overwrite') \
.option('header', 'true') \
.csv('cases')
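To inspect the split before writing, the same transformation can be shown directly (using the sample record from the question, where n is "sun moon"):
df.withColumn("FName", split(col("n"), " ").getItem(0))\
  .withColumn("LName", split(col("n"), " ").getItem(1))\
  .select("n", "FName", "LName")\
  .show()
# For the sample record this gives FName = "sun" and LName = "moon".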
No, you can just do it this way:
sname = name.split(" ")
The above line splits the name wherever it encounters a space.
Fname = sname[0]
Lname = sname[-1]
from pyspark.sql.functions import split,size,col
df2=(spark
.createDataFrame(
[
("f1 f2","l1 l2 l3"),
("f1","l1 l2"),
("f1 f2 f3","l1 l2 l3 l4")
],
["first_name","last_name"]
)
)
(df2
.withColumn("last_name_size",size(split(col("last_name")," ")))
.withColumn("first_name2",split(col("first_name")," ").getItem(0))
.withColumn("last_name2",split(col("last_name")," ").getItem(col("last_name_size")-1)).show())
The result is
+----------+-----------+--------------+-----------+----------+
|first_name| last_name|last_name_size|first_name2|last_name2|
+----------+-----------+--------------+-----------+----------+
| f1 f2| l1 l2 l3| 3| f1| l3|
| f1| l1 l2| 2| f1| l2|
| f1 f2 f3|l1 l2 l3 l4| 4| f1| l4|
+----------+-----------+--------------+-----------+----------+
What I did was simply create an auxiliary column with the size of the split last_name;
with that it is possible to get the last item in the array.
If df is the dataframe where the customer's full name is in column CUST_NAME (for example, VIKRANT R THAKUR), then the following will work:
from pyspark.sql.functions import initcap, substring_index

df1 = df.withColumn('FIRST_NAME', initcap(substring_index('CUST_NAME', ' ', 1)))    # -> Vikrant
df1 = df1.withColumn('LAST_NAME', initcap(substring_index('CUST_NAME', ' ', -1)))   # -> Thakur
I am new to Python and DataFrames. I am writing Python code to run an ETL job in AWS Glue. Please find the code snippet below.
test_DyF = glueContext.create_dynamic_frame.from_catalog(database="teststoragedb", table_name="testtestfile_csv")
test_dataframe = test_DyF.select_fields(['empid','name']).toDF()
The above test_dataframe is of type pyspark.sql.dataframe.DataFrame.
Now I need to loop through it. As far as I can see, the only options are collect or toLocalIterator. Please find the sample code below:
for row_val in test_dataframe.collect():
But both of these methods are very slow and inefficient. I cannot use pandas, as it is not supported by AWS Glue.
Please find the steps I am doing:
source information:
productid|matchval|similar product|similar product matchval
product A|100|product X|100
product A|101|product Y|101
product B|100|product X|100
product C|102|product Z|102
expected result:
product |similar products
product A|product X, product Y
product B|product X
product C|product Z
This is the logic I am implementing:
1. Get a distinct dataframe of the source with productid.
2. Loop through this distinct dataframe set:
   a) get the list of matchval for the product from the source
   b) identify the similar product based on matchval filters
   c) loop through to get the concatenated string --> this loop using rdd.collect is what affects the performance
Can you please share any better suggestions on what can be done?
Please elaborate on what logic you want to try out. DF looping can be done via a SQL approach, or you can follow the RDD approach below:
def my_function(each_record):
    # my_logic goes here, applied to each record
    pass

# loop through each record
df.rdd.foreach(my_function)
Adding the following code based on your input:
df = spark.read.csv("/mylocation/61250775.csv", header=True, inferSchema=True, sep="|")
seq = ['product X','product Y','product Z']
df2 = df.groupBy("productid").pivot("similar_product",seq).count()
+---------+---------+---------+---------+
|productid|product X|product Y|product Z|
+---------+---------+---------+---------+
|product B| 1| null| null|
|product A| 1| 1| null|
|product C| null| null| 1|
+---------+---------+---------+---------+
The final approach, which matches your requirement:
df = spark.read.csv("/mylocation/61250775.csv", header=True, inferSchema=True, sep="|")
df.printSchema()
root
|-- id: string (nullable = true)
|-- matchval1: integer (nullable = true)
|-- similar: string (nullable = true)
|-- matchval3: integer (nullable = true)
from pyspark.sql.functions import concat_ws, collect_list, col

dfx = df.groupBy("id").agg(concat_ws(",", collect_list("similar")).alias("Similar_Items")).select(col("id"), col("Similar_Items"))
dfx.show()
+---------+-------------------+
| id| Similar_Items|
+---------+-------------------+
|product B| product X|
|product A|product X,product Y|
|product C| product Z|
+---------+-------------------+
You can also use the Map class. In my case, I was iterating through the data and calculating a hash for the full row.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import hashlib
## #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## #type: DataSource
## #args: [database = "load-test", table_name = "table_test", transformation_ctx = "datasource0"]
## #return: datasource0
## #inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "load-test", table_name = "table_test", transformation_ctx = "datasource0")
def hash_calculation(rec):
md5 = hashlib.md5()
md5.update('{}_{}_{}_{}'.format(rec["funcname"],rec["parameter"],rec["paramtype"],rec["structure"]).encode())
rec["hash"] = md5.hexdigest()
print("looping the recs")
return rec
mapped_dyF = Map.apply(frame = datasource0, f = hash_calculation)
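To sanity-check the result, the mapped DynamicFrame can be converted to a DataFrame and inspected (a quick sketch; the field names are the ones used in the hash above):
mapped_dyF.toDF().select("funcname", "hash").show(5, truncate=False)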