Split a full name into first name and last name in PySpark?

So basically I'm learning PySpark, and I know how to split a full name into a first name and last name in Python:
name = "sun moon"
FName = name.split()[0]
LName = name.split()[1]
I want to do the same in PySpark with a JSON file:
{"l":"santee, california, united states","t":"161xxxx","caseN":"888548748565","caseL":"CA","n":"sun moon"}
My code:
df = spark.read.json("cases.json")
df.select("l","t","caseN","caseL","n")
df \
    .write \
    .mode('overwrite') \
    .option('header', 'true') \
    .csv('cases')
I want to split n into FName and LName.

from pyspark.sql.functions import split, col

df = spark.read.json("cases.json")
df.select("l","t","caseN","caseL","n")\
    .withColumn("FName", split(col("n"), " ").getItem(0))\
    .withColumn("LName", split(col("n"), " ").getItem(1))\
    .write \
    .mode('overwrite') \
    .option('header', 'true') \
    .csv('cases')

You can just do it this way:
sname = name.split(" ")
The line above splits the name wherever it encounters a space.
Fname = sname[0]
Lname = sname[-1]

from pyspark.sql.functions import split, size, col

df2 = (spark
    .createDataFrame(
        [
            ("f1 f2", "l1 l2 l3"),
            ("f1", "l1 l2"),
            ("f1 f2 f3", "l1 l2 l3 l4")
        ],
        ["first_name", "last_name"]
    )
)

(df2
    .withColumn("last_name_size", size(split(col("last_name"), " ")))
    .withColumn("first_name2", split(col("first_name"), " ").getItem(0))
    .withColumn("last_name2", split(col("last_name"), " ").getItem(col("last_name_size") - 1))
    .show())
The result is
+----------+-----------+--------------+-----------+----------+
|first_name| last_name|last_name_size|first_name2|last_name2|
+----------+-----------+--------------+-----------+----------+
| f1 f2| l1 l2 l3| 3| f1| l3|
| f1| l1 l2| 2| f1| l2|
| f1 f2 f3|l1 l2 l3 l4| 4| f1| l4|
+----------+-----------+--------------+-----------+----------+
What I did was simply create an auxiliary column with the size of the split last_name;
with that, it is possible to get the last item in the array.
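On Spark 2.4 and later there is also element_at, which accepts negative indices, so the last element can be taken directly without the auxiliary size column. A minimal sketch, assuming the same df2 as above:
from pyspark.sql.functions import col, element_at, split

# -1 picks the last token of the split array, so no size column is needed
df2.withColumn("last_name2", element_at(split(col("last_name"), " "), -1)).show()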

If df is the data frame where the customer's full name is in the column CUST_NAME (for example, VIKRANT R THAKUR), then the following will work:
from pyspark.sql.functions import initcap, substring_index
df1 = df.withColumn('FIRST_NAME', initcap(substring_index('CUST_NAME', ' ', 1)))   # -> Vikrant
df1 = df1.withColumn('LAST_NAME', initcap(substring_index('CUST_NAME', ' ', -1)))  # -> Thakur

Related

Using PySpark, how to convert plain text to a CSV file

When I create a Hive table, the data is as follows.
Data file:
<__name__>abc
<__code__>1
<__value__>1234
<__name__>abcdef
<__code__>2
<__value__>12345
<__name__>abcdef
<__code__>2
<__value__>12345
1234156321
<__name__>abcdef
<__code__>2
<__value__>12345
...
Can I create a table right away without converting the file? It's a plain text file in which the three columns repeat.
How can I convert it to a DataFrame or a CSV file?
I want
| name | code | value
| abc | 1 | 1234
| abcdef | 2 | 12345
...
or
abc,1,1234
abcdef,2,12345
...
I solved my problem like this.
from pyspark.sql import Row
from pyspark.sql.types import StructType, StringType, IntegerType
from pyspark.sql.functions import split, regexp_replace, first

data = spark.read.text(path)
rows = data.rdd.zipWithIndex().map(lambda x: Row(x[0].value, int(x[1] / 3)))
schema = StructType() \
    .add("col1", StringType(), False) \
    .add("record_pos", IntegerType(), False)
df = spark.createDataFrame(rows, schema)
df1 = df.withColumn("key", regexp_replace(split(df["col1"], '__>')[0], '<|__', '')) \
    .withColumn("value", regexp_replace(regexp_replace(split(df["col1"], '__>')[1], '\n', '<NL>'), '\t', '<TAB>'))
dataframe = df1.groupBy("record_pos").pivot("key").agg(first("value")).drop("record_pos")
dataframe.show()
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StructType, StringType, LongType}
import org.apache.spark.sql.functions.{split, regexp_replace, first}

val path = "file:///C:/stackqustions/data/stackq5.csv"
val data = sc.textFile(path)
import spark.implicits._
val rdd = data.zipWithIndex.map {
    case (records, index) => Row(records, index / 3)
}
val schema = new StructType().add("col1", StringType, false).add("record_pos", LongType, false)
val df = spark.createDataFrame(rdd, schema)
val df1 = df
    .withColumn("key", regexp_replace(split($"col1", ">")(0), "<|__", ""))
    .withColumn("value", split($"col1", ">")(1)).drop("col1")
df1.groupBy("record_pos").pivot("key").agg(first($"value")).drop("record_pos").show
result:
+----+------+-----+
|code| name|value|
+----+------+-----+
| 1| abc| 1234|
| 2|abcdef|12345|
| 2|abcdef|12345|
| 2|abcdef|12345|
+----+------+-----+
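If the goal is the comma-separated file asked for in the question rather than a displayed table, a write step along these lines could follow the Python solution above. This is only a sketch; the output path is a placeholder.
# assumes `dataframe` from the Python solution above; the path is hypothetical
dataframe.select("name", "code", "value") \
    .write \
    .mode("overwrite") \
    .option("header", "true") \
    .csv("converted_csv_output")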

Read file to determine the rules

I have an excel file with user defined business rules as below:
Column_Name|Operator|Column_Value1|Operand|RuleID|Result
ABC | Equal| 12| and| 1| 1
CDE | Equal| 10| and| 1| 1
XYZ | Equal| AD| | 1| 1.5
ABC | Equal| 11| and| 2| 1
CDE | Equal| 10| | 2| 1.2
and so on (the | symbol is used just for formatting purposes).
Input file (CSV) will look like below:
ABC,CDE,XYZ
12,10,AD
11,10,AD
The goal here is to derive an output column called Result, which needs to be looked up against the user-defined business rules in the Excel file.
Output Expected:
ABC,CDE,XYZ,Result
12,10,AD,1.5
11,10,AD,1.2
So far I have tried to generate an if statement and assign the entire if/elif block to a function, so that I can pass it to the statement below to apply the rules:
output_df['result'] = input_df.apply(result_func, axis=1)
When I hard-code the rules in the function, it works as shown below:
def result_func(input_df):
    if input_df['ABC'] == 12:
        return '1.25'
    elif input_df['ABC'] == 11:
        return '0.25'
    else:
        return '1'
Is this the right way of handling this scenario? If so how do I pass the entire dynamically generated if/elif to the function?
Code
import pandas as pd
import csv
# Load rules table
rules_table = []
with open('rules.csv') as csvfile:
    reader = csv.DictReader(csvfile, delimiter='|')
    for row in reader:
        rules_table.append([x.strip() for x in row.values()])

# Load CSV file into DataFrame
df = pd.read_csv('data.csv', sep=",")

def rules_eval(row, rules):
    " Steps through rules table for appropriate value "
    def operator_eval(op, col, value):
        if op == 'Equal':
            return str(row[col]) == str(value)
        else:
            # Currently only Equal supported
            raise ValueError(f"Unsupported Operator Value {op}, only Equal allowed")

    prev_rule = '~'
    for col, op, val, operand, rule, res in rules:
        # loop through rows of rule table
        if prev_rule != rule:
            # rule ID changed so we can follow rule chains again
            ignore_rule = False
        if not ignore_rule:
            if operator_eval(op, col, val):
                if operand != 'and':
                    return res
            else:
                # Rule didn't work for an item in group
                # ignore subsequent rules with this id
                ignore_rule = True
        prev_rule = rule
    return None

df['results'] = df.apply(lambda row: rules_eval(row, rules_table), axis=1)
print(df)
Output
ABC CDE XYZ results
0 12 10 AD 1.5
1 11 10 AD 1.2
Explanation
df.apply applies the rules_eval function to each row of the DataFrame.
The output is placed into the column 'results' via
df['results'] = ...
Handling Rule Priority
Change
Added a Priority column to the rules_table so rules with the same RuleID are processed in order of priority.
The priority order is decided by the tuple ordering added to the heap, currently:
Priority, Column_Name, Operator, Column_Value, Operand, RuleID, Result
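To illustrate that ordering, here is a tiny standalone sketch (made-up values) showing that the heap pops the lowest priority number first:
from heapq import heappush, heappop

queue = []
for item in [(2, "rule B"), (1, "rule A"), (3, "rule C")]:
    heappush(queue, item)
while queue:
    print(heappop(queue))   # (1, 'rule A'), then (2, 'rule B'), then (3, 'rule C')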
Code
import pandas as pd
import csv
from collections import namedtuple
from heapq import (heappush, heappop)
# Load CSV file into DataFrame
df = pd.read_csv('data.csv', sep=",")
class RulesEngine():
    ###########################################
    # Static members
    ###########################################
    # Named tuple for rules
    fieldnames = 'Column_Name|Operator|Column_Value1|Operand|RuleID|Priority|Result'
    Rule = namedtuple('Rule', fieldnames.replace('|', ' '))
    number_fields = fieldnames.count('|') + 1

    ###########################################
    # members
    ###########################################
    def __init__(self, table_file):
        # Load rules table
        rules_table = []
        with open(table_file) as csvfile:
            reader = csv.DictReader(csvfile, delimiter='|')
            for row in reader:
                fields = [self.convert(x.strip()) for x in row.values() if x is not None]
                if len(fields) != self.number_fields:
                    # Incorrect number of values
                    error = f"Rules require {self.number_fields} fields per row, was given {len(fields)}"
                    raise ValueError(error)
                rules_table.append([self.convert(x.strip()) for x in row.values()])
                #rules_table.append([x.strip() for x in row.values()])
        self.rules_table = rules_table

    def convert(self, s):
        " Convert string to (int, float, or leave current value) "
        try:
            return int(s)
        except ValueError:
            try:
                return float(s)
            except ValueError:
                return s

    def operator_eval(self, row, rule):
        " Determines value for a rule "
        if rule.Operator == 'Equal':
            return str(row[rule.Column_Name]) == str(rule.Column_Value1)
        else:
            # Currently only Equal supported
            error = f"Unsupported Operator {rule.Operator}, only Equal allowed"
            raise ValueError(error)

    def get_rule_value(self, row, rule_queue):
        " Value of a rule or None if no matching rule "
        found_match = True
        while rule_queue:
            priority, rule_to_process = heappop(rule_queue)
            if not self.operator_eval(row, rule_to_process):
                found_match = False
                break
        return rule_to_process.Result if found_match else None

    def rules_eval(self, row):
        " Steps through rules table for appropriate value "
        rule_queue = []
        for index, r in enumerate(self.rules_table):
            # Create named tuple with current rule values
            current_rule = self.Rule(*r)
            if not rule_queue or \
                    rule_queue[-1][1].RuleID == current_rule.RuleID:
                # note: rule_queue[-1][1].RuleID is previous rule
                # Within same rule group or last rule of group
                priority = current_rule.Priority
                # heap orders rules by priority
                # (lowest numbers are processed first)
                heappush(rule_queue, (priority, current_rule))
                if index < len(self.rules_table) - 1:
                    continue  # not at last rule, so keep accumulating
            # Process rules in the rules queue
            rule_value = self.get_rule_value(row, rule_queue)
            if rule_value:
                return rule_value
            else:
                # Starting over with new rule group
                rule_queue = []
                priority = current_rule.Priority
                heappush(rule_queue, (priority, current_rule))
        # Process final queue if not empty
        return self.get_rule_value(row, rule_queue)
# Init rules engine with rules from CSV file
rules_engine = RulesEngine('rules.csv')
df['results'] = df.apply(rules_engine.rules_eval, axis=1)
print(df)
Data Table
ABC,CDE,XYZ
12,10,AD
11,10,AD
12,12,AA
Rules Table
Column_Name|Operator|Column_Value1|Operand|RuleID|Priority|Result
ABC | Equal| 12| and| 1| 2|1
CDE | Equal| 10| and| 1| 1|1
XYZ | Equal| AD| and| 1| 3|1.5
ABC | Equal| 11| and| 2| 1|1
CDE | Equal| 10| foo| 2| 2|1.2
ABC | Equal| 12| foo| 3| 1|1.8
Output
ABC CDE XYZ results
0 12 10 AD 1.5
1 11 10 AD 1.2
2 12 12 AA 1.8

Pyspark DataFrame loop

I am new to Python and DataFrames. I am writing Python code to run an ETL job in AWS Glue. Please find the code snippet below.
test_DyF = glueContext.create_dynamic_frame.from_catalog(database="teststoragedb", table_name="testtestfile_csv")
test_dataframe = test_DyF.select_fields(['empid','name']).toDF()
The test_dataframe above is of type pyspark.sql.dataframe.DataFrame.
Now I need to loop through it. As far as I can see, the only options are collect or toLocalIterator. Here is some sample code:
for row_val in test_dataframe.collect():
But both of these methods are very slow and inefficient. I cannot use pandas, as it is not supported by AWS Glue.
These are the steps I am following.
source information:
productid|matchval|similar product|similar product matchval
product A|100|product X|100
product A|101|product Y|101
product B|100|product X|100
product C|102|product Z|102
expected result:
product |similar products
product A|product X, product Y
product B|product X
product C|product Z
This is the code I am writing:
I am getting a distinct dataframe of the source with productid.
Then I loop through this distinct data frame set:
a) get the list of matchval for the product from the source
b) identify the similar product based on the matchval filters
c) loop through to get the concatenated string ---> this loop using rdd.collect is affecting the performance
Can you please share any better suggestion on what can be done?
Please elaborate on what logic you want to try out. DataFrame looping can be done via a SQL approach, or you can follow the RDD approach below:
def my_function(each_record):
    # my_logic goes here
    # this runs once for each record
    pass

df.rdd.foreach(my_function)
I added the following code, based on your input:
df = spark.read.csv("/mylocation/61250775.csv", header=True, inferSchema=True, sep="|")
seq = ['product X','product Y','product Z']
df2 = df.groupBy("productid").pivot("similar_product",seq).count()
+---------+---------+---------+---------+
|productid|product X|product Y|product Z|
+---------+---------+---------+---------+
|product B| 1| null| null|
|product A| 1| 1| null|
|product C| null| null| 1|
+---------+---------+---------+---------+
The final approach, which matches your requirement:
df = spark.read.csv("/mylocation/61250775.csv", header=True, inferSchema=True, sep="|")
df.printSchema()
>>> df.printSchema()
root
|-- id: string (nullable = true)
|-- matchval1: integer (nullable = true)
|-- similar: string (nullable = true)
|-- matchval3: integer (nullable = true)
from pyspark.sql.functions import concat_ws, collect_list, col
dfx = df.groupBy("id").agg(concat_ws(",", collect_list("similar")).alias("Similar_Items")).select(col("id"), col("Similar_Items"))
dfx.show()
+---------+-------------------+
| id| Similar_Items|
+---------+-------------------+
|product B| product X|
|product A|product X,product Y|
|product C| product Z|
+---------+-------------------+
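If the source could list the same similar product more than once for a given id, collect_set (which drops duplicates) could be swapped in for collect_list; a hedged variant of the aggregation above:
from pyspark.sql.functions import col, collect_set, concat_ws

# same grouping as above, but duplicate similar products are removed
dfx = df.groupBy("id").agg(concat_ws(",", collect_set("similar")).alias("Similar_Items")).select(col("id"), col("Similar_Items"))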
You can also use the Map class. In my case, I was iterating through the data and calculating a hash for the full row.
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
import hashlib
## #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## #type: DataSource
## #args: [database = "load-test", table_name = "table_test", transformation_ctx = "datasource0"]
## #return: datasource0
## #inputs: []
datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "load-test", table_name = "table_test", transformation_ctx = "datasource0")
def hash_calculation(rec):
    md5 = hashlib.md5()
    md5.update('{}_{}_{}_{}'.format(rec["funcname"], rec["parameter"], rec["paramtype"], rec["structure"]).encode())
    rec["hash"] = md5.hexdigest()
    print("looping the recs")
    return rec
mapped_dyF = Map.apply(frame = datasource0, f = hash_calculation)

How to replace non-ASCII characters in a PySpark dataframe [duplicate]

I need to delete accents from characters in Spanish and other languages in different datasets.
I already wrote a function, based on the code provided in this post, that removes the accents. The problem is that the function is slow because it uses a UDF.
I'm wondering whether I can improve its performance, because this works well for small dataframes but not for big ones.
Thanks in advance.
Here is the code; you will be able to run it as it is presented:
# Importing sql types
from pyspark.sql.types import StringType, IntegerType, StructType, StructField
from pyspark.sql.functions import udf, col
import unicodedata
# Building a simple dataframe:
schema = StructType([StructField("city", StringType(), True),
StructField("country", StringType(), True),
StructField("population", IntegerType(), True)])
countries = ['Venezuela', 'US#A', 'Brazil', 'Spain']
cities = ['Maracaibó', 'New York', ' São Paulo ', '~Madrid']
population = [37800000,19795791,12341418,6489162]
# Dataframe:
df = sqlContext.createDataFrame(list(zip(cities, countries, population)), schema=schema)
df.show()
class Test():
    def __init__(self, df):
        self.df = df

    def clearAccents(self, columns):
        """This function deletes accents in the string columns of a dataFrame;
        it does not eliminate the main characters, but only deletes their accents.
        :param columns  String or a list of column names.
        """
        # Filters all string columns in dataFrame
        validCols = [c for (c, t) in filter(lambda t: t[1] == 'string', self.df.dtypes)]

        # If None or [] is provided with column parameter:
        if (columns == "*"): columns = validCols[:]

        # Receives a string as an argument
        def remove_accents(inputStr):
            # first, normalize strings:
            nfkdStr = unicodedata.normalize('NFKD', inputStr)
            # Keep chars that have no other char combined (i.e. accent chars)
            withOutAccents = u"".join([c for c in nfkdStr if not unicodedata.combining(c)])
            return withOutAccents

        function = udf(lambda x: remove_accents(x) if x != None else x, StringType())
        exprs = [function(col(c)).alias(c) if (c in columns) and (c in validCols) else c for c in self.df.columns]
        self.df = self.df.select(*exprs)
foo = Test(df)
foo.clearAccents(columns="*")
foo.df.show()
One possible improvement is to build a custom Transformer, which will handle Unicode normalization, and a corresponding Python wrapper. It should reduce the overall overhead of passing data between the JVM and Python, and it doesn't require any modifications to Spark itself or access to a private API.
On JVM side you'll need a transformer similar to this one:
package net.zero323.spark.ml.feature

import java.text.Normalizer
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.{DataType, StringType}

class UnicodeNormalizer (override val uid: String)
  extends UnaryTransformer[String, String, UnicodeNormalizer] {

  def this() = this(Identifiable.randomUID("unicode_normalizer"))

  private val forms = Map(
    "NFC" -> Normalizer.Form.NFC, "NFD" -> Normalizer.Form.NFD,
    "NFKC" -> Normalizer.Form.NFKC, "NFKD" -> Normalizer.Form.NFKD
  )

  val form: Param[String] = new Param(this, "form", "unicode form (one of NFC, NFD, NFKC, NFKD)",
    ParamValidators.inArray(forms.keys.toArray))

  def setN(value: String): this.type = set(form, value)

  def getForm: String = $(form)

  setDefault(form -> "NFKD")

  override protected def createTransformFunc: String => String = {
    val normalizerForm = forms($(form))
    (s: String) => Normalizer.normalize(s, normalizerForm)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == StringType, s"Input type must be string type but got $inputType.")
  }

  override protected def outputDataType: DataType = StringType
}
Corresponding build definition (adjust Spark and Scala versions to match your Spark deployment):
name := "unicode-normalization"
version := "1.0"
crossScalaVersions := Seq("2.11.12", "2.12.8")
organization := "net.zero323"
val sparkVersion = "2.4.0"
libraryDependencies ++= Seq(
"org.apache.spark" %% "spark-core" % sparkVersion,
"org.apache.spark" %% "spark-sql" % sparkVersion,
"org.apache.spark" %% "spark-mllib" % sparkVersion
)
On the Python side you'll need a wrapper similar to this one:
from pyspark.ml.param.shared import *
# from pyspark.ml.util import keyword_only  # in Spark < 2.0
from pyspark import keyword_only
from pyspark.ml.wrapper import JavaTransformer

class UnicodeNormalizer(JavaTransformer, HasInputCol, HasOutputCol):

    @keyword_only
    def __init__(self, form="NFKD", inputCol=None, outputCol=None):
        super(UnicodeNormalizer, self).__init__()
        self._java_obj = self._new_java_obj(
            "net.zero323.spark.ml.feature.UnicodeNormalizer", self.uid)
        self.form = Param(self, "form",
            "unicode form (one of NFC, NFD, NFKC, NFKD)")
        # kwargs = self.__init__._input_kwargs  # in Spark < 2.0
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, form="NFKD", inputCol=None, outputCol=None):
        # kwargs = self.setParams._input_kwargs  # in Spark < 2.0
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setForm(self, value):
        return self._set(form=value)

    def getForm(self):
        return self.getOrDefault(self.form)
Build Scala package:
sbt +package
Include it when you start the shell or submit a job. For example, for a Spark build with Scala 2.11:
bin/pyspark --jars path-to/target/scala-2.11/unicode-normalization_2.11-1.0.jar \
--driver-class-path path-to/target/scala-2.11/unicode-normalization_2.11-1.0.jar
and you should be ready to go. All that is left is a little bit of regexp magic:
from pyspark.sql.functions import regexp_replace
normalizer = UnicodeNormalizer(form="NFKD",
inputCol="text", outputCol="text_normalized")
df = sc.parallelize([
    (1, "Maracaibó"), (2, "New York"),
    (3, " São Paulo "), (4, "~Madrid")
]).toDF(["id", "text"])

(normalizer
    .transform(df)
    .select(regexp_replace("text_normalized", "\p{M}", ""))
    .show())
## +--------------------------------------+
## |regexp_replace(text_normalized,\p{M},)|
## +--------------------------------------+
## | Maracaibo|
## | New York|
## | Sao Paulo |
## | ~Madrid|
## +--------------------------------------+
Please note that this follows the same conventions as the built-in text transformers and is not null safe. You can easily correct for that by checking for null in createTransformFunc.
Another way of doing this, using the Python Unicode database:
import unicodedata
import sys
from pyspark.sql.functions import translate, regexp_replace

def make_trans():
    matching_string = ""
    replace_string = ""
    for i in range(ord(" "), sys.maxunicode):
        name = unicodedata.name(chr(i), "")
        if "WITH" in name:
            try:
                base = unicodedata.lookup(name.split(" WITH")[0])
                matching_string += chr(i)
                replace_string += base
            except KeyError:
                pass
    return matching_string, replace_string

def clean_text(c):
    matching_string, replace_string = make_trans()
    return translate(
        regexp_replace(c, "\p{M}", ""),
        matching_string, replace_string
    ).alias(c)
So now let's test it :
df = sc.parallelize([
(1, "Maracaibó"), (2, "New York"),
(3, " São Paulo "), (4, "~Madrid"),
(5, "São Paulo"), (6, "Maracaibó")
]).toDF(["id", "text"])
df.select(clean_text("text")).show()
## +---------------+
## | text|
## +---------------+
## | Maracaibo|
## | New York|
## | Sao Paulo |
## | ~Madrid|
## | Sao Paulo|
## | Maracaibo|
## +---------------+
Acknowledgement to @zero323.
This solution is Python only, but is only useful if the number of possible accents is low (e.g. one single language like Spanish) and the character replacements are manually specified.
There seems to be no built-in way to do what you asked for directly without UDFs; however, you can chain many regexp_replace calls to replace each possible accented character. I tested the performance of this solution, and it turns out that it only runs faster if you have a very limited set of accents to replace. If that's the case, it can be faster than a UDF because it is optimized outside of Python.
from pyspark.sql.functions import col, regexp_replace
accent_replacements_spanish = [
    (u'á', 'a'), (u'Á', 'A'),
    (u'é', 'e'), (u'É', 'E'),
    (u'í', 'i'), (u'Í', 'I'),
    (u'ò', 'o'), (u'Ó', 'O'),
    (u'ú|ü', 'u'), (u'Ú|Ű', 'U'),
    (u'ñ', 'n'),
    # see http://stackoverflow.com/a/18123985/3810493 for other characters
    # this will convert other non ASCII characters to a question mark:
    ('[^\x00-\x7F]', '?')
]

def remove_accents(column):
    r = col(column)
    for a, b in accent_replacements_spanish:
        r = regexp_replace(r, a, b)
    return r.alias('remove_accents(' + column + ')')
df = sqlContext.createDataFrame([['Olà'], ['Olé'], ['Núñez']], ['str'])
df.select(remove_accents('str')).show()
I haven't compared the performance with the other responses and this function is not as general, but it is at least worth considering because you don't need to add Scala or Java to your build process.
Here's my implementation.
Apart from accents I also remove special characters, because I needed to pivot and save a table, and you can't save a table with a column name that contains " ,;{}()\n\t=\/" characters.
import re
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, StringType, StructType, StructField
from unidecode import unidecode
spark = SparkSession.builder.getOrCreate()
data = [(1, " \\ / \\ {____} aŠdá_ \t = \n () asd ____aa 2134_ 23_"), (1, "N"), (2, "false"), (2, "1"), (3, "NULL"),
(3, None)]
schema = StructType([StructField("id", IntegerType(), True), StructField("txt", StringType(), True)])
df = SparkSession.builder.getOrCreate().createDataFrame(data, schema)
df.show()
for col_name in ["txt"]:
tmp_dict = {}
for col_value in [row[0] for row in df.select(col_name).distinct().toLocalIterator()
if row[0] is not None]:
new_col_value = re.sub("[ ,;{}()\\n\\t=\\\/]", "_", col_value)
new_col_value = re.sub('_+', '_', new_col_value)
if new_col_value.startswith("_"):
new_col_value = new_col_value[1:]
if new_col_value.endswith("_"):
new_col_value = new_col_value[:-1]
new_col_value = unidecode(new_col_value)
tmp_dict[col_value] = new_col_value.lower()
df = df.na.replace(to_replace=tmp_dict, subset=[col_name])
df.show()
If you can't access external libraries (like me), you can replace unidecode with:
new_col_value = new_col_value.translate(str.maketrans(
"ä,ö,ü,ẞ,á,ä,č,ď,é,ě,í,ĺ,ľ,ň,ó,ô,ŕ,š,ť,ú,ů,ý,ž,Ä,Ö,Ü,ẞ,Á,Ä,Č,Ď,É,Ě,Í,Ĺ,Ľ,Ň,Ó,Ô,Ŕ,Š,Ť,Ú,Ů,Ý,Ž",
"a,o,u,s,a,a,c,d,e,e,i,l,l,n,o,o,r,s,t,u,u,y,z,A,O,U,S,A,A,C,D,E,E,I,L,L,N,O,O,R,S,T,U,U,Y,Z"))

How to resolve duplicate column names while joining two dataframes in PySpark?

I have files A and B which are exactly the same. I am trying to perform inner and outer joins on these two dataframes. Since all of my columns are duplicates, the existing answers were of no help.
The other questions that I have gone through contain a column or two as duplicates; my issue is that the whole files are duplicates of each other, both in data and in column names.
My code:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import DataFrameReader, DataFrameWriter
from datetime import datetime
import time
# #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
print("All imports were successful.")
df = spark.read.orc(
's3://****'
)
print("First dataframe read with headers set to True")
df2 = spark.read.orc(
's3://****'
)
print("Second dataframe read with headers set to True")
# df3 = df.join(df2, ['c_0'], "outer")
# df3 = df.join(
# df2,
# df["column_test_1"] == df2["column_1"],
# "outer"
# )
df3 = df.alias('l').join(df2.alias('r'), on='c_0') #.collect()
print("Dataframes have been joined successfully.")
output_file_path = 's3://****'
df3.write.orc(
output_file_path
)
print("Dataframe has been written to csv.")
job.commit()
The error that I am facing is:
pyspark.sql.utils.AnalysisException: u'Duplicate column(s): "c_4", "c_38", "c_13", "c_27", "c_50", "c_16", "c_23", "c_24", "c_1", "c_35", "c_30", "c_56", "c_34", "c_7", "c_46", "c_49", "c_57", "c_45", "c_31", "c_53", "c_19", "c_25", "c_10", "c_8", "c_14", "c_42", "c_20", "c_47", "c_36", "c_29", "c_15", "c_43", "c_32", "c_5", "c_37", "c_18", "c_54", "c_3", "__created_at__", "c_51", "c_48", "c_9", "c_21", "c_26", "c_44", "c_55", "c_2", "c_17", "c_40", "c_28", "c_33", "c_41", "c_22", "c_11", "c_12", "c_52", "c_6", "c_39" found, cannot save to file.;'
End of LogType:stdout
There is no shortcut here. Pyspark expects the left and right dataframes to have distinct sets of field names (with the exception of the join key).
One solution would be to prefix each field name with either a "left_" or "right_" as follows:
# Obtain column lists
left_cols = df.columns
right_cols = df2.columns

# Prefix each dataframe's fields with "left_" or "right_"
df = df.selectExpr([col + ' as left_' + col for col in left_cols])
df2 = df2.selectExpr([col + ' as right_' + col for col in right_cols])

# Perform the join on the now-prefixed key columns
df3 = df.join(df2, df['left_c_0'] == df2['right_c_0'])
Here is a helper function to join two dataframes adding aliases:
def join_with_aliases(left, right, on, how, right_prefix):
    renamed_right = right.selectExpr(
        [
            col + f" as {col}{right_prefix}"
            for col in right.columns
            if col not in on
        ]
        + on
    )
    return left.join(renamed_right, on=on, how=how)
and here is an example of how to use it:
df1 = spark.createDataFrame([[1, "a"], [2, "b"], [3, "c"]], ("id", "value"))
df2 = spark.createDataFrame([[1, "a"], [2, "b"], [3, "c"]], ("id", "value"))
join_with_aliases(
left=df1,
right=df2,
on=["id"],
how="inner",
right_prefix="_right"
).show()
+---+-----+-----------+
| id|value|value_right|
+---+-----+-----------+
|  1|    a|          a|
|  3|    c|          c|
|  2|    b|          b|
+---+-----+-----------+
I did something like this but in Scala; you can convert the same into PySpark as well (a rough PySpark sketch follows the Scala code).
Rename the column names in each dataframe:
dataFrame1.columns.foreach(columnName => {
    dataFrame1 = dataFrame1.select(dataFrame1.columns.head, dataFrame1.columns.tail: _*).withColumnRenamed(columnName, s"left_$columnName")
})
dataFrame2.columns.foreach(columnName => {
    dataFrame2 = dataFrame2.select(dataFrame2.columns.head, dataFrame2.columns.tail: _*).withColumnRenamed(columnName, s"right_$columnName")
})
Now join by mentioning the column names:
val resultDF = dataFrame1.join(dataFrame2, dataFrame1("left_c_0") === dataFrame2("right_c_0"))
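A rough PySpark equivalent of the renaming-and-joining above might look like the sketch below. This is only an illustration; it assumes the df and df2 frames from the question and the c_0 join key.
# rename every column with a left_/right_ prefix, then join on the prefixed key
for c in df.columns:
    df = df.withColumnRenamed(c, "left_" + c)
for c in df2.columns:
    df2 = df2.withColumnRenamed(c, "right_" + c)

result_df = df.join(df2, df["left_c_0"] == df2["right_c_0"])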
