With a dataframe as follows:
from pyspark.sql.functions import avg, first
rdd = sc.parallelize(
    [
        (0, "A", 223, "201603", "PORT"),
        (0, "A", 22, "201602", "PORT"),
        (0, "A", 422, "201601", "DOCK"),
        (1, "B", 3213, "201602", "DOCK"),
        (1, "B", 3213, "201601", "PORT"),
        (2, "C", 2321, "201601", "DOCK")
    ]
)
df_data = sqlContext.createDataFrame(rdd, ["id", "type", "cost", "date", "ship"])
df_data.show()
I do a pivot on it,
df_data.groupby(df_data.id, df_data.type).pivot("date").agg(avg("cost"), first("ship")).show()
+---+----+----------------+--------------------+----------------+--------------------+----------------+--------------------+
| id|type|201601_avg(cost)|201601_first(ship)()|201602_avg(cost)|201602_first(ship)()|201603_avg(cost)|201603_first(ship)()|
+---+----+----------------+--------------------+----------------+--------------------+----------------+--------------------+
| 2| C| 2321.0| DOCK| null| null| null| null|
| 0| A| 422.0| DOCK| 22.0| PORT| 223.0| PORT|
| 1| B| 3213.0| PORT| 3213.0| DOCK| null| null|
+---+----+----------------+--------------------+----------------+--------------------+----------------+--------------------+
But I get these really complicated names for the columns. Applying an alias to the aggregation usually works, but in this case, because of the pivot, the names are even worse:
+---+----+--------------------------------------------------------------+------------------------------------------------------------------+--------------------------------------------------------------+------------------------------------------------------------------+--------------------------------------------------------------+------------------------------------------------------------------+
| id|type|201601_(avg(cost),mode=Complete,isDistinct=false) AS cost#1619|201601_(first(ship)(),mode=Complete,isDistinct=false) AS ship#1620|201602_(avg(cost),mode=Complete,isDistinct=false) AS cost#1619|201602_(first(ship)(),mode=Complete,isDistinct=false) AS ship#1620|201603_(avg(cost),mode=Complete,isDistinct=false) AS cost#1619|201603_(first(ship)(),mode=Complete,isDistinct=false) AS ship#1620|
+---+----+--------------------------------------------------------------+------------------------------------------------------------------+--------------------------------------------------------------+------------------------------------------------------------------+--------------------------------------------------------------+------------------------------------------------------------------+
| 2| C| 2321.0| DOCK| null| null| null| null|
| 0| A| 422.0| DOCK| 22.0| PORT| 223.0| PORT|
| 1| B| 3213.0| PORT| 3213.0| DOCK| null| null|
+---+----+--------------------------------------------------------------+------------------------------------------------------------------+--------------------------------------------------------------+------------------------------------------------------------------+--------------------------------------------------------------+------------------------------------------------------------------+
Is there a way to rename the column names on the fly on the pivot and aggregation?
You can alias the aggregations directly:
pivoted = df_data \
    .groupby(df_data.id, df_data.type) \
    .pivot("date") \
    .agg(
        avg('cost').alias('cost'),
        first("ship").alias('ship')
    )
pivoted.printSchema()
## root
## |-- id: long (nullable = true)
## |-- type: string (nullable = true)
## |-- 201601_cost: double (nullable = true)
## |-- 201601_ship: string (nullable = true)
## |-- 201602_cost: double (nullable = true)
## |-- 201602_ship: string (nullable = true)
## |-- 201603_cost: double (nullable = true)
## |-- 201603_ship: string (nullable = true)
A simple regular expression should do the trick:
import re

def clean_names(df):
    p = re.compile(r"^(\w+?)_([a-z]+)\((\w+)\)(?:\(\))?")
    return df.toDF(*[p.sub(r"\1_\3", c) for c in df.columns])
pivoted = df_data.groupby(...).pivot(...).agg(...)
clean_names(pivoted).printSchema()
## root
## |-- id: long (nullable = true)
## |-- type: string (nullable = true)
## |-- 201601_cost: double (nullable = true)
## |-- 201601_ship: string (nullable = true)
## |-- 201602_cost: double (nullable = true)
## |-- 201602_ship: string (nullable = true)
## |-- 201603_cost: double (nullable = true)
## |-- 201603_ship: string (nullable = true)
If you want to preserve the function name, change the substitution pattern to, for example, \1_\2_\3.
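For example, here is a quick illustration (not from the original answer) of how the two substitution patterns behave on sample column names:

import re

p = re.compile(r"^(\w+?)_([a-z]+)\((\w+)\)(?:\(\))?")
print(p.sub(r"\1_\3", "201601_avg(cost)"))         # 201601_cost
print(p.sub(r"\1_\2_\3", "201601_avg(cost)"))      # 201601_avg_cost
print(p.sub(r"\1_\2_\3", "201601_first(ship)()"))  # 201601_first_ship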
A simple approach is to use alias after the aggregate function.
I start with the df_data Spark DataFrame you created.
df_data.groupby(df_data.id, df_data.type).pivot("date").agg(avg("cost").alias("avg_cost"), first("ship").alias("first_ship")).show()
+---+----+---------------+-----------------+---------------+-----------------+---------------+-----------------+
| id|type|201601_avg_cost|201601_first_ship|201602_avg_cost|201602_first_ship|201603_avg_cost|201603_first_ship|
+---+----+---------------+-----------------+---------------+-----------------+---------------+-----------------+
| 1| B| 3213.0| PORT| 3213.0| DOCK| null| null|
| 2| C| 2321.0| DOCK| null| null| null| null|
| 0| A| 422.0| DOCK| 22.0| PORT| 223.0| PORT|
+---+----+---------------+-----------------+---------------+-----------------+---------------+-----------------+
The column names take the form "pivotvalue_alias". In your case, the pivot value is 201601 and the alias is avg_cost, so the column name is 201601_avg_cost (joined by an underscore "_").
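As a side note (my own addition, not part of the original answer), this predictable naming convention makes it easy to work with the pivoted columns programmatically, for example to select only the cost columns:

pivoted = df_data.groupby("id", "type").pivot("date").agg(
    avg("cost").alias("avg_cost"), first("ship").alias("first_ship"))

cost_cols = [c for c in pivoted.columns if c.endswith("_avg_cost")]
pivoted.select("id", "type", *cost_cols).show()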
I wrote an easy and fast function to do this. Enjoy! :)
# This function renames a pivot table's ugly default column names
def rename_pivot_cols(rename_df, remove_agg):
    """Change a Spark pivot table's default ugly column names with ease.
    Option 1: remove_agg = True:  `2_sum(sum_amt)` --> `sum_amt_2`
    Option 2: remove_agg = False: `2_sum(sum_amt)` --> `sum_sum_amt_2`
    """
    for column in rename_df.columns:
        start_index = column.find('(')
        end_index = column.find(')')
        if start_index < 0 or end_index < 0:
            continue  # not a pivoted aggregation column, leave it as is
        prefix = column[:column.find('_')]  # the pivot value, e.g. `2` or `201601`
        if remove_agg:
            # keep only the aggregated column name: `sum_amt_2`
            new_column = column[start_index + 1:end_index] + '_' + prefix
        else:
            # keep the aggregation function as well: `sum_sum_amt_2`
            flattened = column.replace('(', '_').replace(')', '')
            new_column = flattened[len(prefix) + 1:] + '_' + prefix
        rename_df = rename_df.withColumnRenamed(column, new_column)
    return rename_df
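For example, applied to the pivoted DataFrame from the question (hypothetical usage; the exact pre-rename column names depend on your Spark version):

pivoted = df_data.groupby("id", "type").pivot("date").agg(avg("cost"), first("ship"))
rename_pivot_cols(pivoted, remove_agg=True).printSchema()
# e.g. 201601_avg(cost) --> cost_201601 and 201601_first(ship)() --> ship_201601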
A modified version of zero323's answer, for Spark 2.4:
import re

def clean_names(df):
    p = re.compile(r"^(\w+?)_([a-z]+)\((\w+)(,\s\w+)\)(:\s\w+)?")
    return df.toDF(*[p.sub(r"\1_\3", c) for c in df.columns])
The current column names look like 0_first(is_flashsale, false): int.
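A quick hypothetical check of that pattern on a single column name:

import re

p = re.compile(r"^(\w+?)_([a-z]+)\((\w+)(,\s\w+)\)(:\s\w+)?")
print(p.sub(r"\1_\3", "0_first(is_flashsale, false)"))  # 0_is_flashsale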
Related
I have a column of date and a column of count.
e.g.:
Date        Count
3/07/2010   1
2/01/2010   2
1/07/2012   5
I used the code below to change the data type to date:
func = udf(lambda x: datetime.strptime(x, '%d/%m/%Y'), DateType())
crime_mongodb_df = crime_mongodb_df.withColumn('Reported Date', func(col('Reported Date')))
Then, I want to group the data by year and find the total count per year. I am not sure how to do the grouping.
Can I get some help? Thanks!
We can use functions from pyspark.sql.functions to do all of this, including the type change, quite easily :)
from pyspark.sql.functions import to_date, col, year
df = spark.createDataFrame([('3/07/2012', 1), ('2/07/2010', 2), ('1/07/2010', 5)], ["Date", "Count"])
df.show()
df.printSchema()
+---------+-----+
| Date|Count|
+---------+-----+
|3/07/2012| 1|
|2/07/2010| 2|
|1/07/2010| 5|
+---------+-----+
root
|-- Date: string (nullable = true)
|-- Count: long (nullable = true)
adjustedDf = df.withColumn("Date", to_date(col("Date"), "d/MM/yyyy"))\
.withColumn('year', year("Date"))
adjustedDf.show()
+----------+-----+----+
| Date|Count|year|
+----------+-----+----+
|2012-07-03| 1|2012|
|2010-07-02| 2|2010|
|2010-07-01| 5|2010|
+----------+-----+----+
adjustedDf.groupBy("year").sum("Count").show()
+----+----------+
|year|sum(Count)|
+----+----------+
|2010| 7|
|2012| 1|
+----+----------+
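If you also want a friendlier column name than sum(Count), you can alias the aggregation (a small optional tweak, not part of the original answer):

from pyspark.sql.functions import sum as sum_

adjustedDf.groupBy("year").agg(sum_("Count").alias("total_count")).show()
# same totals as above, but the column is named total_count instead of sum(Count)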
I have a dataframe and I'm doing this:
df = dataframe.withColumn("test", lit(0.4219759403))
I want to get just the first four numbers after the dot, without rounding.
When I cast to DecimalType, with .cast(DataTypes.createDecimalType(20,4)),
or even with the round function, the number is rounded to 0.4220.
The only way that I found without rounding is applying the function format_number(), but this function gives me a string, and when I cast this string to DecimalType(20,4), the framework rounds the number again to 0.4220.
I need to convert this number to DecimalType(20,4) without rounding, and I expect to see 0.4219.
If you have numbers with more than one digit before the decimal point, the fixed-length substring used in the other answer below is not suitable. Instead, you can use a regex to always extract the first four decimal digits (if present).
You can do this using regexp_extract
df = dataframe.withColumn('rounded', F.regexp_extract(F.col('test'), r'\d+\.\d{0,4}', 0))
Example
import pyspark.sql.functions as F
dataframe = spark.createDataFrame([
    (0.4219759403, ),
    (0.4, ),
    (1.0, ),
    (0.5431293, ),
    (123.769859, )
], ['test'])

df = dataframe.withColumn('rounded', F.regexp_extract(F.col('test'), r'\d+\.\d{0,4}', 0))
df.show()
+------------+--------+
| test| rounded|
+------------+--------+
|0.4219759403| 0.4219|
| 0.4| 0.4|
| 1.0| 1.0|
| 0.5431293| 0.5431|
| 123.769859|123.7698|
+------------+--------+
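Since regexp_extract returns a string, you can then cast the truncated value to the requested DecimalType; at that point the value already has at most four decimal digits, so nothing gets rounded (a small follow-up sketch, not part of the original answer):

from pyspark.sql.types import DecimalType

df = df.withColumn('rounded', F.col('rounded').cast(DecimalType(20, 4)))
# 0.4219759403 ends up as 0.4219 rather than 0.4220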
Hi, welcome to Stack Overflow. Next time, please try to provide a reproducible example with the code you tried. Anyway, this works for me:
import pyspark.sql.functions as F
from pyspark.sql.functions import lit
from pyspark.sql.types import DecimalType

df = spark.createDataFrame([
    (1, "a"),
    (2, "b"),
    (3, "c"),
], ["ID", "Text"])
df = df.withColumn("test", lit(0.4219759403))
df = df.withColumn("test_string", F.substring(df["test"].cast("string"), 0, 6))
df = df.withColumn("test_string_decimaltype", df["test_string"].cast(DecimalType(20,4)))
df.show()
df.printSchema()
+---+----+------------+-----------+-----------------------+
| ID|Text| test|test_string|test_string_decimaltype|
+---+----+------------+-----------+-----------------------+
| 1| a|0.4219759403| 0.4219| 0.4219|
| 2| b|0.4219759403| 0.4219| 0.4219|
| 3| c|0.4219759403| 0.4219| 0.4219|
+---+----+------------+-----------+-----------------------+
root
|-- ID: long (nullable = true)
|-- Text: string (nullable = true)
|-- test: double (nullable = false)
|-- test_string: string (nullable = false)
|-- test_string_decimaltype: decimal(20,4) (nullable = true)
Of course, if you want, you can overwrite the same column by always using "test"; I chose different names so you can see the steps.
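One caveat (my own note, not from the original answer): the fixed-length substring only keeps four decimal places when there is exactly one digit before the decimal point, which is the point made in the regexp_extract answer above. For example:

df2 = spark.createDataFrame([(123.769859,)], ["test"])
df2.withColumn("test_string", F.substring(df2["test"].cast("string"), 0, 6)).show()
# test_string is '123.76', i.e. only two digits after the dot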
I would like to create an empty DataFrame whose schema matches an existing PySpark DataFrame. I tried using StructType manually.
To create an empty DataFrame, call spark.createDataFrame with an empty list and the schema object from the original DataFrame:
df = spark.createDataFrame([(1, 1)], ('foo', 'bar'))
df.printSchema()
# root
# |-- foo: long (nullable = true)
# |-- bar: long (nullable = true)
df.show()
# +---+---+
# |foo|bar|
# +---+---+
# | 1| 1|
# +---+---+
empty_df = spark.createDataFrame([], df.schema)
empty_df.printSchema()
# root
# |-- foo: long (nullable = true)
# |-- bar: long (nullable = true)
empty_df.show()
# +---+---+
# |foo|bar|
# +---+---+
# +---+---+
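If you do want to spell out the schema yourself (as the question's StructType attempt suggests), an equivalent manual definition for this two-column example could look like this sketch:

from pyspark.sql.types import StructType, StructField, LongType

schema = StructType([
    StructField("foo", LongType(), True),
    StructField("bar", LongType(), True),
])
empty_df = spark.createDataFrame([], schema)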
The following is a sample of my Spark DataFrame with the printSchema below it:
+--------------------+---+------+------+--------------------+
| device_id|age|gender| group| apps|
+--------------------+---+------+------+--------------------+
|-9073325454084204615| 24| M|M23-26| null|
|-8965335561582270637| 28| F|F27-28|[1.0,1.0,1.0,1.0,...|
|-8958861370644389191| 21| M| M22-|[4.0,0.0,0.0,0.0,...|
|-8956021912595401048| 21| M| M22-| null|
|-8910497777165914301| 25| F|F24-26| null|
+--------------------+---+------+------+--------------------+
only showing top 5 rows
root
|-- device_id: long (nullable = true)
|-- age: integer (nullable = true)
|-- gender: string (nullable = true)
|-- group: string (nullable = true)
|-- apps: vector (nullable = true)
I'm trying to fill the nulls in the 'apps' column with np.zeros(19237). However, when I execute
df.fillna({'apps': np.zeros(19237)})
I get an error
Py4JJavaError: An error occurred while calling o562.fill.
: java.lang.IllegalArgumentException: Unsupported value type java.util.ArrayList
Or if I try
df.fillna({'apps': DenseVector(np.zeros(19237))})
I get an error
AttributeError: 'numpy.ndarray' object has no attribute '_get_object_id'
Any ideas?
DataFrameNaFunctions support only a subset of native types (no UDTs), so you'll need a UDF here.
from pyspark.sql.functions import coalesce, col, udf
from pyspark.ml.linalg import Vectors, VectorUDT
def zeros(n):
    def zeros_():
        return Vectors.sparse(n, {})
    return udf(zeros_, VectorUDT())()
Example usage:
df = spark.createDataFrame(
[(1, Vectors.dense([1, 2, 3])), (2, None)],
("device_id", "apps"))
df.withColumn("apps", coalesce(col("apps"), zeros(3))).show()
+---------+-------------+
|device_id| apps|
+---------+-------------+
| 1|[1.0,2.0,3.0]|
| 2| (3,[],[])|
+---------+-------------+
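Applied to the question's data it is the same call with the real vector length; using a sparse zero vector keeps this cheap even at 19237 dimensions:

df = df.withColumn("apps", coalesce(col("apps"), zeros(19237)))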
I have a Spark 1.5.0 DataFrame with a mix of null and empty strings in the same column. I want to convert all empty strings in all columns to null (None, in Python). The DataFrame may have hundreds of columns, so I'm trying to avoid hard-coded manipulations of each column.
See my attempt below, which results in an error.
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
## Create a test DataFrame
testDF = sqlContext.createDataFrame([Row(col1='foo', col2=1), Row(col1='', col2=2), Row(col1=None, col2='')])
testDF.show()
## +----+----+
## |col1|col2|
## +----+----+
## | foo| 1|
## | | 2|
## |null|null|
## +----+----+
## Try to replace an empty string with None/null
testDF.replace('', None).show()
## ValueError: value should be a float, int, long, string, list, or tuple
## A string value of null (obviously) doesn't work...
testDF.replace('', 'null').na.drop(subset='col1').show()
## +----+----+
## |col1|col2|
## +----+----+
## | foo| 1|
## |null| 2|
## +----+----+
It is as simple as this:
from pyspark.sql.functions import col, when
def blank_as_null(x):
    return when(col(x) != "", col(x)).otherwise(None)
dfWithEmptyReplaced = testDF.withColumn("col1", blank_as_null("col1"))
dfWithEmptyReplaced.show()
## +----+----+
## |col1|col2|
## +----+----+
## | foo| 1|
## |null| 2|
## |null|null|
## +----+----+
dfWithEmptyReplaced.na.drop().show()
## +----+----+
## |col1|col2|
## +----+----+
## | foo| 1|
## +----+----+
If you want to fill multiple columns you can, for example, use reduce:
from functools import reduce  # needed on Python 3

to_convert = set([...])  # Some set of columns

reduce(lambda df, x: df.withColumn(x, blank_as_null(x)), to_convert, testDF)
or use a comprehension:
exprs = [
    blank_as_null(x).alias(x) if x in to_convert else x for x in testDF.columns]
testDF.select(*exprs)
If you want to specifically operate on string fields, please check the answer by robin-loxley.
UDFs are not terribly efficient. The correct way to do this using a built-in method is:
df = df.withColumn('myCol', when(col('myCol') == '', None).otherwise(col('myCol')))
Simply building on zero323's and soulmachine's answers, to convert all StringType fields:
from pyspark.sql.types import StringType

string_fields = []
for f in test_df.schema.fields:
    if isinstance(f.dataType, StringType):
        string_fields.append(f.name)

exprs = [blank_as_null(x).alias(x) if x in string_fields else x for x in test_df.columns]
test_df.select(*exprs)
My solution is much better than all the solutions I've seen so far; it can deal with as many fields as you want. See the little function below:
// Replace empty Strings with null values
private def setEmptyToNull(df: DataFrame): DataFrame = {
  val exprs = df.schema.map { f =>
    f.dataType match {
      case StringType => when(length(col(f.name)) === 0, lit(null: String).cast(StringType)).otherwise(col(f.name)).as(f.name)
      case _ => col(f.name)
    }
  }
  df.select(exprs: _*)
}
You can easily rewrite the function above in Python; a rough sketch follows below. I learned this trick from @liancheng.
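A rough Python equivalent (my own sketch, not from the original answer):

from pyspark.sql.functions import col, length, lit, when
from pyspark.sql.types import StringType

def set_empty_to_null(df):
    exprs = [
        when(length(col(f.name)) == 0, lit(None).cast(StringType())).otherwise(col(f.name)).alias(f.name)
        if isinstance(f.dataType, StringType) else col(f.name)
        for f in df.schema.fields
    ]
    return df.select(*exprs)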
If you are using Python, you can check the following.
+----+-----+----+
| id| name| age|
+----+-----+----+
|null|name1| 50|
| 2| | |
| |name3|null|
+----+-----+----+
from pyspark.sql.functions import col, when

def convertToNull(dfa):
    for i in dfa.columns:
        dfa = dfa.withColumn(i, when(col(i) == '', None).otherwise(col(i)))
    return dfa
convertToNull(dfa).show()
+----+-----+----+
| id| name| age|
+----+-----+----+
|null|name1| 50|
| 2| null|null|
|null|name3|null|
+----+-----+----+
I would add a trim to @zero323's solution, to handle values that consist only of whitespace:
def blank_as_null(x):
    return when(trim(col(x)) != "", col(x))
Thanks to @zero323, @Tomerikoo and @Robin Loxley. A ready-to-use function:
def convert_blank_to_null(df, cols=None):
    from pyspark.sql.functions import col, when, trim
    from pyspark.sql.types import StringType

    def blank_as_null(x):
        return when(trim(col(x)) == "", None).otherwise(col(x))

    # Don't know how to parallelize this
    for f in (df.select(cols) if cols else df).schema.fields:
        if isinstance(f.dataType, StringType):
            df = df.withColumn(f.name, blank_as_null(f.name))
    return df
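Hypothetical usage on the question's testDF, optionally restricted to specific columns:

convert_blank_to_null(testDF).show()
convert_blank_to_null(testDF, cols=["col1"]).show()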
This is a different version of soulmachine's solution, but I don't think you can translate this to Python as easily:
def emptyStringsToNone(df: DataFrame): DataFrame = {
  df.schema.foldLeft(df)(
    (current, field) =>
      field.dataType match {
        case DataTypes.StringType =>
          current.withColumn(
            field.name,
            when(length(col(field.name)) === 0, lit(null: String)).otherwise(col(field.name))
          )
        case _ => current
      }
  )
}