Read bytes file using pyspark - python

I have a file that contains byte data like the example below, delimited by '#*':
b'\x00\x00V\x97'#*b'2%'#*b'\x00\x00'#*b'\xc5'#*b'\t'#*b'\xc0'
I want to read this data with PySpark, and each column needs a different conversion method to turn it into ASCII.
I have read the file with StringType(), but when I run the conversion it throws:
AttributeError: 'str' object has no attribute 'decode'
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType, StructField, StructType

customSchema = StructType(
    [
        StructField("Col1", StringType(), True),
        StructField("Col2", StringType(), True),
        StructField("Col3", StringType(), True),
    ]
)
df = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .schema(customSchema)
    .option("sep", "#*")
    .load("/FileStore/tables/EbcdicTextData.txt")
)
df.show()
# col1 col2 col3
# |b'\x00\x01d0'|b'I\x08'|b'\x00\x00'|
# CH / ZD fields: EBCDIC text (code page 037), with NULs and trailing blanks stripped
def unpack_ch_or_zd(bytes: bytearray) -> str:
    ascii_text = bytes.decode("cp037").replace("\x00", "").rstrip()
    return ascii_text if ascii_text.isascii() else "Non-ASCII"

# PD fields: packed decimal; the last nibble is the sign ('d'/'b' mean negative)
def unpack_pd_or_pd_plus(bytes) -> str:
    ascii_text = (
        "" if bytes.hex()[-1:] != "d" and bytes.hex()[-1:] != "b" else "-"
    ) + bytes.hex()[:-1]
    return ascii_text if ascii_text.isascii() else "Non-ASCII"

# PD fields with an implied decimal point, inserted 'decimal' digits from the right
def unpack_pd_or_pd_plus_dec(bytes, decimal: int) -> str:
    ascii_text = (
        "" if bytes.hex()[-1:] != "d" and bytes.hex()[-1:] != "b" else "-"
    ) + bytes.hex()[:-1]
    ascii_text = (
        ascii_text[:-decimal] + "." + ascii_text[-decimal:]
        if ascii_text.isascii()
        else "Non-ASCII"
    )
    return ascii_text

# BI fields: big-endian binary integers
def unpack_bi_or_biplus_no_dec(bytes) -> str:
    # elif (type.lower() == "bi" or ( type.lower() == "bi+" and bytes.hex() <= HighestPositive[:len(bytes) * 2])) and decimal == 0:
    a = str(int("0x" + bytes.hex(), 0))
    return a if a.isascii() else "Non-ASCII"
unpack_ch_or_zd_UDF = udf(lambda x: unpack_ch_or_zd(x), StringType())
unpack_pd_or_pd_plus_UDF = udf(lambda x: unpack_pd_or_pd_plus(x), StringType())
unpack_pd_or_pd_plus_dec_UDF = udf(lambda x: unpack_pd_or_pd_plus_dec(x), StringType())
layout_df is a DataFrame that contains the column names and the conversion type for each column:
for row in layout_df.collect():
    column_name = row["Field_name"]
    conversion_type = row["Python_Data_type"]
    if conversion_type.lower() == "ch" or conversion_type.lower() == "zd":
        df = df.withColumn(column_name, unpack_ch_or_zd_UDF(col(column_name)))
    elif (
        conversion_type.lower() == "pd" or conversion_type.lower() == "pd+"
    ) and decimal == 0:
        df = df.withColumn(column_name, unpack_pd_or_pd_plus_UDF(col(column_name)))
    elif (
        conversion_type.lower() == "pd" or conversion_type.lower() == "pd+"
    ) and decimal > 0:
        df = df.withColumn(column_name, unpack_pd_or_pd_plus_dec_UDF(col(column_name)))
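The AttributeError most likely appears because spark.read with a StringType schema hands the UDFs the literal text b'\x00\x00V\x97' rather than real bytes, and str has no decode(). A minimal sketch of one way around this, assuming the file really stores that printable repr (the names below just mirror the code above and are not part of any existing API):

import ast
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

def to_real_bytes(cell: str) -> bytes:
    # ast.literal_eval turns the text "b'\x00\x00V\x97'" back into a real bytes object
    return ast.literal_eval(cell)

def unpack_ch_or_zd_from_str(cell: str) -> str:
    raw = to_real_bytes(cell)
    ascii_text = raw.decode("cp037").replace("\x00", "").rstrip()
    return ascii_text if ascii_text.isascii() else "Non-ASCII"

unpack_ch_or_zd_from_str_UDF = udf(unpack_ch_or_zd_from_str, StringType())
# df = df.withColumn("Col1", unpack_ch_or_zd_from_str_UDF(col("Col1")))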

Related

Unable to run Python script in PowerBI

I need to connect to an API that requires a SHA computed from a token and a session id.
I call for the token and session id, but to make the next call I need to run a Python script. I want to use the table from the authorization call as a parameter in the Python script:
let
api_key = "_api_key",
Source = Json.Document(Web.Contents("https://xxxxxxxxxxx/api/public/json/_Api/auth_call/_api_method/getToken?_api_auth=key", [ApiKeyName = "api_key"])),
#"Converted to Table" = Record.ToTable(Source),
Value = #"Converted to Table"{0}[Value],
#"Converted to Table1" = Record.ToTable(Value),
#"Transposed Table" = Table.Transpose(#"Converted to Table1"),
#"Promoted Headers" = Table.PromoteHeaders(#"Transposed Table", [PromoteAllScalars=true]),
#"Changed Type" = Table.TransformColumnTypes(#"Promoted Headers",{{"token", type text}, {"session_id", type text}}),
#"Run Python script" = Python.Execute("# 'dataset' holds the input data for this script#(lf)#(lf)import requests#(lf)import pandas as pd#(lf)from hashlib import sha1#(lf)#(lf)def LSGetData(sufix, limit=all, object_type='contact', additionallData ={}):#(lf) host = ""https://xxxxxxxxxxxxx/api/public/json/""#(lf) authHost = ""_Api/auth_call/_api_method/getToken""#(lf) api_key = 'xxxxxxxxxxxxx'#(lf) api_secret = 'xxxxxxxxxxxxx'#(lf)#(lf) data = {'_api_auth': ""key"", '_api_key': api_key}#(lf)#(lf) token = dataset['token']#(lf) session_id = dataset['session_id']#(lf)#(lf) api_sha = sha1((api_key + token + api_secret).encode('utf-8')).hexdigest()#(lf) data = {'_api_auth': ""key"", '_api_key': api_key, '_api_sha': api_sha, '_api_session': session_id, 'limit': limit}#(lf)#(lf) if sufix == 'kontakty':#(lf) data['type'] = 'contact'#(lf) elif sufix == 'firmy':#(lf) data['type'] = 'company'#(lf) elif sufix == 'przestrzenie':#(lf) data['type'] = 'space'#(lf) elif sufix == 'tablica':#(lf) data['object type'] = object_type # 'contact', 'company'. 'deal', 'space'#(lf)#(lf) for key, data in additionallData.keys():#(lf) data[key] = data#(lf)#(lf) lsq = requests.post(host + typeOfData[sufix], data=data)#(lf) return lsq.json()#(lf)#(lf)#(lf)def CreateDataFrame(lsdata):#(lf) if sufix == 'kontakty':#(lf) df = pd.DataFrame(lsdata['data']['contact'])#(lf) elif sufix == 'firmy':#(lf) df = pd.DataFrame(lsdata['data']['company'])#(lf) elif sufix == 'szanse':#(lf) df = pd.DataFrame(lsdata['data']['deal'])#(lf) elif sufix == 'tablica':#(lf) df = pd.DataFrame(lsdata['data']['items'])#(lf) elif sufix == 'zadania':#(lf) df = pd.DataFrame(lsdata['data']['todo'])#(lf) elif sufix == 'przestrzenie':#(lf) df = pd.DataFrame(lsdata['data']['space'])#(lf) return df#(lf)#(lf)#(lf)def DataFiler(dataFrame):#(lf) #filter dataFrame here#(lf) dataFrame = dataFrame[dataFrame.owner_name.notnull()]#(lf) return dataFrame#(lf)#(lf)typeOfData = {#(lf) 'kontakty': 'Contact/getAll',#(lf) 'firmy': 'Contact/getAll',#(lf) 'szanse': 'Deal/getAll',#(lf) 'przestrzenie': 'Space/getAll',#(lf) 'zadania': 'Todo/getTodoObjects',#(lf) 'tablica': 'Wall/getList'#(lf)}#(lf)#(lf)sufix = 'firmy'#(lf)#(lf)lsdata = LSGetData(sufix)#(lf)lsDataFrame = CreateDataFrame(lsdata)#(lf)fileredlsLSDataFrame = DataFiler(lsDataFrame)#(lf)#(lf)print(lsDataFrame)",[dataset=#"Changed Type"])
in
#"Run Python script"
Unfortunately this doesn't work.
How do I use "dataset" in this script to make it work?
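For what it's worth, Python.Execute exposes the bound query as a pandas DataFrame, so dataset['token'] is a whole Series rather than a string. A minimal sketch of the part of the script that would likely need to change, assuming the binding [dataset=#"Changed Type"] shown above:

from hashlib import sha1

# 'dataset' is the pandas DataFrame Power BI builds from #"Changed Type".
# Each column is a Series, so take the first row's value to get plain strings.
token = dataset['token'].iloc[0]
session_id = dataset['session_id'].iloc[0]

# api_key and api_secret are the same literals already defined in the script above
api_sha = sha1((api_key + token + api_secret).encode('utf-8')).hexdigest()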

Output more than one excel/csv file from a class. OOP. Class Inheritance

I am getting an AttributeError as follows:
self.filtered_df.to_excel(self.output_filepath, index=False)
AttributeError: 'tuple' object has no attribute 'to_excel'
I am inheriting from a class in another class I am developing. At the moment the inherited class lets me output one Excel file; can I change the method in this class so that the new class can output more than one Excel file?
Here is the inherited class:
class ReportQueryCommand(LogCommand):
    """Performs reports through a ReportQueryStrategy instance.

    It is possible to overwrite existing queries; this means that it is possible
    to perform subsequent filters by launching multiple commands having
    key_input = strategy.name.

    Attributes:
        state: DatasetState containing so-far computed DataFrames.
        strategy: defines how to perform the query.
        filepath: optional. Defines where to save the queried DataFrame.
        key_input: optional (None if df_to_filter is not None).
            Defines a key that allows to access a DataFrame in the State.
        df_to_filter: optional (None if key is not None).
            DataFrame to apply the query on.
    """

    def __init__(self,
                 strategy: ReportQueryStrategy,
                 base_path: Optional[str] = None,
                 key_input: Optional[str] = None,
                 key_output: Optional[str] = None,
                 df_to_filter: Optional[pd.DataFrame] = None,
                 filepath: Optional[str] = None,
                 directory_path: Optional[str] = None,
                 write_output: bool = True):
        super().__init__(base_path=base_path,
                         directory_path=directory_path,
                         copy_before_update=False,
                         write_output=write_output)
        self.state = DatasetState()
        self.strategy = strategy
        self.filepath = filepath
        self.key_input = key_input
        self._key_output = key_output
        self.df_to_filter = df_to_filter
        self.filtered_df = None

    @property
    def _output_file(self) -> str:
        return self.strategy.output_filename

    def write_to_file(self):
        if self.filtered_df is None:
            raise ValueError("Missing computed dataframe")
        self.filtered_df.to_excel(self.output_filepath, index=False)

    @property
    def key_output(self) -> str:
        # first scenario: key was defined
        if self._key_output is not None:
            return self._key_output
        # second scenario: key was not defined, by default,
        # concatenate key_input and strategy
        strategy_output_name = self.strategy.output_name
        if self.key_input is not None:
            return f"{self.key_input}-{strategy_output_name}"
        # WCS: no key defined, just assign the strategy key
        return strategy_output_name

    def execute(self, output_from_previous: Any = None, **kwargs) -> Any:
        self.filtered_df = self.strategy.transform(key=self.key_input,
                                                   df=self.df_to_filter)
        self.state.query_reports[self.key_output] = self.filtered_df
        super().execute(output_from_previous=output_from_previous, **kwargs)
As you can see, the method I need to update is write_to_file(self).
Here are snippets of the relevant code in development to show where the problem could be occurring. The code below is what might need to change to let me output two or more Excel files:
class TtestStrategy(ReportQueryStrategy):
    """
    """

    @staticmethod
    def _strategy_key():
        return 't-test', 'fisher-test'

    def __init__(self,
                 query_name: Optional[str] = None,
                 reference_query: str = None,
                 sample_query: str = None,
                 alpha: float = 0.05,
                 chemical_key: str = 'chemical',
                 plate_key: str = 'plate',
                 value_key: str = 'processed_relative_fp',
                 group_column: str = 'Lot',
                 return_pivoted: bool = True):
        super().__init__(query_name=query_name)
        self.pvalues = []
        self.alpha = alpha
        self.chemical_key, self.plate_key, self.value_key, self.reference_query, self.sample_query = (
            chemical_key,
            plate_key,
            value_key,
            reference_query,
            sample_query)
        self.group_column = group_column
        self.return_pivoted = return_pivoted

    def fishers_method(self, pvalues) -> tuple[bool, float, float, float]:
        pvalues = [item for item in pvalues if not pd.isnull(item)]
        comb_z, comb_pval = stats.combine_pvalues(pvalues, method="fisher")
        k = len(pvalues)
        mean_FDR = (self.alpha * (k + 1)) / (2 * k)
        if comb_pval > mean_FDR:
            decision = False
        else:
            decision = True
        return decision, comb_z, comb_pval, mean_FDR
    def transform(self,
                  key: Optional[str] = None,
                  df: Optional[pd.DataFrame] = None) -> tuple[Any, DataFrame]:
        counter_nosig = 0
        counter_sig_st_t = 0
        pval_store_st_t = {}
        pval_store_st_t_dec = {}
        pval_store_st_t_val = {}
        pval_store_st_t_val_adj = {}
        pval_store_norm_A = {}
        pval_store_norm_B = {}
        var_store_A = {}
        var_store_B = {}
        pval_store_lev_bart = {}
        decisions = []
        st_pval_arr = []
        df = super().transform(key=key, df=df)
        df = df.loc[(df[self.group_column] == self.reference_query) | (df[self.group_column] == self.sample_query)]
        df_i = df.filter(['plate', 'chemical', 'processed_relative_fp'], axis=1)
        df_i = df.pivot(columns='plate', values='processed_relative_fp', index='chemical')
        df_i.index.name = None
        df_i.columns.name = ''
        plates = {exps: {"processed_relative_fp": series}
                  for exps, series in df_i.to_dict("series").items()}
        chem_order = list(df['chemical'].unique())
        for chem in chem_order:
            if chem != 'empty':
                pool = []
                sample_l = []
                for q in [self.reference_query, self.sample_query]:
                    sample = pd.Series([sample for sample in plates])
                    sample = sample[sample.str.contains(q)]
                    for s in sample:
                        sample_l.append(s)
                        record = plates[s]
                        if record['processed_relative_fp'] is not None:
                            rel_fp = record['processed_relative_fp']
                            lot = s.split("_")[1]
                            cell = s.split('_')[2]
                            rep = s.split("_")[4]
                            val = rel_fp[chem]
                            pool.append({"chems": chem, "lot": lot, "cell": cell, "replicate": rep,
                                         "key": chem + "::" + lot + "::" + rep + "::" + cell, "value": val})
                pool = pd.DataFrame(pool)
                massage = []
                averages = []
                sort_lots = list(pool['lot'].unique())
                for lot in sort_lots:
                    value = list(pool[pool.lot.eq(lot)]['value'].dropna())
                    averages.append(np.mean(np.array(value)))
                    massage.append(value)
                averages = np.array(averages)
                min_ = np.nanmin(averages)
                max_ = np.nanmax(averages)
                pos_min_ = np.where(averages == min_)[0][0]
                pos_max_ = np.where(averages == max_)[0][0]
                perc_diff = (max_ - min_) * 100
                fvalue_st_t, pvalue_st_t = stats.ttest_ind(*massage, equal_var=False)
                st_pval_arr.append(pvalue_st_t)
                st_pval_array = np.asarray(st_pval_arr)
                mask = np.isfinite(st_pval_array)
                st_t_t_pval_adj = np.empty(st_pval_array.shape)
                st_t_t_pval_adj.fill(np.nan)
                rej_st_t, st_t_t_pval_adj[mask], _, _ = sm.stats.multipletests(st_pval_array[mask], method='fdr_bh')
                for v in st_t_t_pval_adj:
                    float(v)
                    pval_store_st_t_val_adj[chem] = v
                for z in rej_st_t:
                    float(z)
                if len(massage[0]) >= 3:
                    test_stat_norm_A, pvalue_norm_A = stats.shapiro(np.array(massage[0]))
                    if pvalue_norm_A < 0.05:
                        pval_store_norm_A[chem] = 'No'
                    else:
                        pval_store_norm_A[chem] = 'Yes'
                else:
                    pval_store_norm_A[chem] = 'Not enough data'
                if len(massage[1]) >= 3:
                    test_stat_norm_B, pvalue_norm_B = stats.shapiro(np.array(massage[1]))
                    if pvalue_norm_B < 0.05:
                        pval_store_norm_B[chem] = 'No'
                    else:
                        pval_store_norm_B[chem] = 'Yes'
                else:
                    pval_store_norm_B[chem] = 'not enough data'
                var_stat_A = stats.variation(np.array(massage[0]))
                var_stat_B = stats.variation(np.array(massage[1]))
                stat_lev_bart, pvalue_lev_bart = stats.levene(*massage, center='mean')
                pval_store_st_t[chem] = pvalue_st_t
                pval_store_st_t_val[chem] = pvalue_st_t
                var_store_A[chem] = var_stat_A
                var_store_B[chem] = var_stat_B
                pval_store_lev_bart[chem] = pvalue_lev_bart
                if pvalue_st_t < 0.05:
                    pval_store_st_t[chem] = 'diff'
                else:
                    pval_store_st_t[chem] = 'same'
                if v > 0.05:
                    pval_store_st_t_dec[chem] = 'same'
                if v < 0.05 and perc_diff > 0.0:
                    pval_store_st_t_dec[chem] = 'diff'
                    counter_sig_st_t += 1
                else:
                    counter_nosig += 1
        decisions.append(pval_store_st_t)
        decisions.append(pval_store_st_t_dec)
        decisions.append(pval_store_st_t_val)
        decisions.append(pval_store_st_t_val_adj)
        decisions.append(pval_store_norm_A)
        decisions.append(pval_store_norm_B)
        decisions.append(var_store_A)
        decisions.append(var_store_B)
        decisions.append(pval_store_lev_bart)
        decisions = pd.DataFrame(decisions)
        decisions = decisions.T
        decisions.columns = [
            f'Welchs t-test result for lot {self.reference_query} v lot {self.sample_query}',
            'Welchs t-test considering Multi-test Correction',
            'Welchs t-test pvalue unadjusted',
            'Welchs t-test pvalue adjusted',
            f'are lot {self.reference_query} chemicals normally distributed?',
            f'are lot {self.sample_query} chemicals normally distributed?',
            f'lot {self.reference_query} variance',
            f'lot {self.sample_query} variance',
            'levenes statistic']
        decisions_before_filter = decisions.copy()
        decisions_before_filter.reset_index(inplace=True)
        decisions_before_filter.rename(columns={'index': 'chemicals'}, inplace=True)
        decisions = decisions.loc[(decisions['Welchs t-test pvalue adjusted'] < self.alpha)]
        decisions.reset_index(inplace=True)
        decisions.rename(columns={'index': 'chemicals'}, inplace=True)
        decisions.sort_values(by=['Welchs t-test pvalue adjusted'], inplace=True)
        decisions_before_filter = decisions_before_filter.filter(items=['chemicals', 'Welchs t-test pvalue adjusted'])
        decisions_before_filter = decisions_before_filter.sort_values(by=['Welchs t-test pvalue adjusted'])
        results_fishers = self.fishers_method(decisions_before_filter['Welchs t-test pvalue adjusted'].tolist())
        results_fishers_df = pd.DataFrame(results_fishers,
                                          index=['decision',
                                                 'combined z-score',
                                                 'combined p-value',
                                                 'meanFDR'])
        results_fishers_df = results_fishers_df.T
        results_fishers_df['k'] = len(decisions_before_filter.dropna())
        results_fishers_df.insert(loc=0, column='comparison', value=(str(self.reference_query + '-' + 'vs'
                                                                         + '-' + self.sample_query)))
        return decisions, results_fishers_df
All of the relevant information is shown above.
The error is telling you that filtered_df is NOT a pandas.DataFrame.
It's a tuple. In fact your TtestStrategy's transform method returns the tuple:
return decisions, results_fishers_df
If you change your code to:
def execute(self, output_from_previous: Any = None, **kwargs) -> Any:
    _ignored, self.filtered_df = self.strategy.transform(key=self.key_input,
                                                         df=self.df_to_filter)
    self.state.query_reports[self.key_output] = self.filtered_df
    super().execute(output_from_previous=output_from_previous, **kwargs)
then you shouldn't see that AttributeError anymore.
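If the goal is also to keep and write out both frames, a rough sketch of how the overriding class could produce two workbooks; the second file name is purely illustrative and not part of the existing API:

    def execute(self, output_from_previous: Any = None, **kwargs) -> Any:
        # Keep both results instead of discarding the first element of the tuple.
        self.decisions_df, self.filtered_df = self.strategy.transform(
            key=self.key_input, df=self.df_to_filter)
        self.state.query_reports[self.key_output] = self.filtered_df
        super().execute(output_from_previous=output_from_previous, **kwargs)

    def write_to_file(self):
        if self.filtered_df is None:
            raise ValueError("Missing computed dataframe")
        self.filtered_df.to_excel(self.output_filepath, index=False)
        if getattr(self, "decisions_df", None) is not None:
            # Hypothetical naming scheme: put the decisions table next to the main report.
            second_path = self.output_filepath.replace(".xlsx", "_decisions.xlsx")
            self.decisions_df.to_excel(second_path, index=False)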

Python method not recognized

I am learning Python, so I have worked on this one thing for a long time and still can't find the answer.
The interpreter says there is no method called _set_icon().
code:
import pyodbc as db
import pandas as pd
import Globals
class BatchNodeData(object):
    """support batch node of the tree. Contains what it needs to do that"""

    def __init__(self):
        pass

    def _set_icon():
        sql_conn = db.connect(Globals.SQL_CONN_STRING)
        b_query = " \
            SELECT top 1 * \
            FROM dbo.ETLBatchRun a \
            Where b.BatchID = " + str(batchid) + \
            "Order by a.StatusDT desc"
        df_icon = pd.read_sql(b_query, sql_conn)
        if not df_icon.empty:
            self.last_status = df_icon['StatusID'].iloc[0]

    def _get_icon_index():
        switcher = {
            1: 2,
            2: 2,
            3: 3,
            4: 4
        }
        switcher_selected = {
            1: 7,
            2: 7,
            3: 8,
            4: 8
        }
        if selected:
            return switcher_selected.get(statusid, 0)  # default 0 (yellow bar)
        else:
            return switcher.get(statusid, 0)  # default 0 (yellow bar)

    def __init__(self, batchid):
        self.batch_id = None
        self.batch_name = None
        self.critical = None
        self.node_icon_index = None
        self.last_status = None
        self.selected = False
        self.running = False
        sql_conn = db.connect(Globals.SQL_CONN_STRING)
        b_query = " \
            select b.BatchID \
            , b.BatchName \
            , c.AttributeValue as Critical \
            , noRun.AttributeValue as noRun \
            from dbo.ETLBatch b (nolock) \
            left join dbo.etlbatchattribute (nolock) c \
            on c.batchid = b.batchid \
            and c.AttributeName = 'Critical' \
            and c.AttributeValue = '1' \
            left join dbo.etlbatchattribute (nolock) noRun \
            on noRun.batchid = b.batchid \
            and noRun.AttributeName = 'NotRunnableInETLMonitor' \
            and noRun.AttributeValue = '1' \
            Where b.BatchID = " + str(batchid)
        df_batch = pd.read_sql(b_query, sql_conn)
        for index, row in df_batch.iterrows():
            batch_id = row['BatchID']
            batch_name = row['BatchName']
            critical = row['Critical']
        _set_icon()
        self.node_icon_index = _get_icon_index()
Since you've declared _set_icon() as a method bound to the class, you should be able to call it as:
BatchNodeData._set_icon()

PySpark where clause condition on condition

I'm getting a syntax error on the query below:
df_result = df_checkout.join(df_checkin,
(
(df_checkout.product == df_checkin.product)
(df_checkout.host == df_checkin.host)
),
how = 'full_outer').where(df_checkout.rank =
F.when(((df_checkout.rank = df_checkin.rank) and (F.unix_timestamp(df_checkout.checkout_date, 'MM/dd/YYYY HH:MI:SS') <= F.unix_timestamp(df_checkin.checkin_date, 'MM/dd/YYYY HH:MI:SS'))), (df_checkin.rank - 1)).when(((df_checkout.rank = df_checkin.rank) and (F.unix_timestamp(df_checkout.checkout_date, 'MM/dd/YYYY HH:MI:SS') >= F.unix_timestamp(df_checkin.checkin_date, 'MM/dd/YYYY HH:MI:SS'))), df_checkin.rank).otherwise(None)
)
What is the error I'm making?
You have a = instead of ==:
(df_checkout.rank = df_checkin.rank)
should be
(df_checkout.rank == df_checkin.rank)
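For reference, a sketch of how the whole expression could look once = becomes ==. Two further adjustments are assumptions on my part: the join condition as posted also seems to be missing an & between the two equality tests, and unix_timestamp expects a Java-style pattern such as 'MM/dd/yyyy HH:mm:ss', so both are adjusted here:

from pyspark.sql import functions as F

df_result = (
    df_checkout.join(
        df_checkin,
        (df_checkout.product == df_checkin.product)
        & (df_checkout.host == df_checkin.host),
        how="full_outer",
    )
    .where(
        df_checkout.rank
        == F.when(
            (df_checkout.rank == df_checkin.rank)
            & (F.unix_timestamp(df_checkout.checkout_date, "MM/dd/yyyy HH:mm:ss")
               <= F.unix_timestamp(df_checkin.checkin_date, "MM/dd/yyyy HH:mm:ss")),
            df_checkin.rank - 1,
        )
        .when(
            (df_checkout.rank == df_checkin.rank)
            & (F.unix_timestamp(df_checkout.checkout_date, "MM/dd/yyyy HH:mm:ss")
               >= F.unix_timestamp(df_checkin.checkin_date, "MM/dd/yyyy HH:mm:ss")),
            df_checkin.rank,
        )
        .otherwise(None)
    )
)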

Spark SQL for-loop error 'bool' attribute has no attribute 'alias'

I am trying to create new columns in a Spark SQL dataframe that compare two columns within the dataframe, and return True if they are equal and False otherwise. I have to do this for a dataset with thousands of columns. As a sample problem, I've included all of my code here. However, the important problem comes in the second for loop at the end of the code block.
from pyspark.sql import SQLContext
from pyspark.sql.types import *
data = sc.parallelize([[1, None, 'BND'], [2, None, 'RMP'], [3, None, 'SWP'], [4, None, "IRS"], [5, None, "SWP"], [6, None, "IRS"]])
match = sc.parallelize([[1, 2, 100], [3, 5, 101], [4, 6, 102]])
trade_schema_string = 'trade_id,match_id,product'
trade_fields = [StructField(field_name, StringType(), True) for field_name in trade_schema_string.split(',')]
trade_fields[0].dataType = IntegerType()
trade_fields[1].dataType = IntegerType()
trade_schema = StructType(trade_fields)
match_schema_string = "pri_netting_id,sec_netting_id,match_id"
match_fields = [StructField(field_name, IntegerType(), True) for field_name in match_schema_string.split(',')]
match_schema = StructType(match_fields)
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(data, trade_schema)
odf = sqlContext.createDataFrame(match, match_schema)
df.registerTempTable("trade")
odf.registerTempTable("match")
# Get match_ids so you can match up front office and back office records
# Change column names for fo and bo dataframes so that they say "bo_product" and "fo_product", etc.
fo = sqlContext.sql("SELECT t.trade_id,t.product,m.match_id FROM trade t INNER JOIN match m WHERE t.trade_id = m.pri_netting_id")
bo = sqlContext.sql("SELECT t.trade_id,t.product,m.match_id FROM trade t INNER JOIN match m WHERE t.trade_id = m.sec_netting_id")
col_names = fo.columns
for x in range(0, len(col_names)):
    col_name = col_names[x]
    fo = fo.withColumnRenamed(col_name, "fo_" + col_name)
    bo = bo.withColumnRenamed(col_name, "bo_" + col_name)
fo.registerTempTable("front_office")
bo.registerTempTable("back_office")
fobo = sqlContext.sql("SELECT f.fo_trade_id,f.fo_product,b.bo_trade_id,b.bo_product FROM front_office f INNER JOIN back_office b WHERE f.fo_match_id = b.bo_match_id")
fobo = fobo.repartition(5)
# How to create diff columns
num_cols = len(fobo.columns)
fobo_names = fobo.columns
first = fobo.first()
for x in range(0, num_cols / 2):
    new_name = "\'diff_" + fobo_names[x][3:] + "\'"
    old_column_fo = "fobo." + fobo_names[x]
    old_column_bo = "fobo." + fobo_names[x + (num_cols / 2)]
    fobo = fobo.withColumn(new_name, old_column_fo == old_column_bo)
The error I get is:
Traceback (most recent call last):
File "", line 8, in
File "/opt/cloudera/parcels/CDH-5.4.0-1.cdh5.4.0.p0.27/lib/spark/python/pyspark/sql/dataframe.py", line 695, in withColumn
return self.select('*', col.alias(colName))
AttributeError: 'bool' object has no attribute 'alias'
So, the strange thing is that if I execute the following by hand:
fobo = fobo.withColumn("diff_product", fobo.fo_product == fobo.bo_product)
and
fobo = fobo.withColumn("diff_trade_id", fobo.fo_trade_id == fobo.bo_trade_id)
The whole thing works perfectly. However, this isn't practical for my true use case, which has many columns.
old_column_fo = "fobo." + fobo_names[x]
old_column_bo = "fobo." + fobo_names[x + (num_cols / 2)]
fobo = fobo.withColumn(new_name, old_column_fo == old_column_bo)
old_column_fo and old_column_bo will be strings that merely look like the attribute names you're trying to access, but they won't be the actual attributes. Try using getattr instead.
old_column_fo = getattr(fobo, fobo_names[x])
old_column_bo = getattr(fobo, fobo_names[x + (num_cols / 2)])
fobo = fobo.withColumn(new_name, old_column_fo == old_column_bo)
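An equivalent sketch that avoids getattr by building the Column objects from the names with pyspark.sql.functions.col (integer division is used so the index stays an int):

from pyspark.sql import functions as F

half = num_cols // 2
for x in range(half):
    new_name = "diff_" + fobo_names[x][3:]
    fobo = fobo.withColumn(
        new_name,
        # col() turns each column name string into an actual Column expression
        F.col(fobo_names[x]) == F.col(fobo_names[x + half]),
    )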
