OutOfMemory while using Jupyter notebook with spark - python

I am currently using IBM Data Scientist Workbench with Jupyter notebooks and Spark.
I am trying to read several CSV files to a DF and then applying some transformations to it in order to create a final dataframe with merged data from the different CSV files, but for some reason I am getting this error:
Caused by: java.lang.OutOfMemoryError: GC overhead limit exceeded
at java.util.Arrays.copyOf(Arrays.java:2367)
at java.lang.AbstractStringBuilder.expandCapacity(AbstractStringBuilder.java:130)
at java.lang.AbstractStringBuilder.ensureCapacityInternal(AbstractStringBuilder.java:114)
at java.lang.AbstractStringBuilder.append(AbstractStringBuilder.java:415)
at java.lang.StringBuilder.append(StringBuilder.java:132)
The code I am using is as follows:
i=0
count = 0
var_name = []
schema = StructType([])
df1 = sqlContext.createDataFrame(sc.emptyRDD(), schema)
df1_ocurrences = sqlContext.createDataFrame(sc.emptyRDD(), schema)
df1_count = sqlContext.createDataFrame(sc.emptyRDD(), schema)
df1_merged = sqlContext.createDataFrame(sc.emptyRDD(), schema)
df1_complete = sqlContext.createDataFrame(sc.emptyRDD(), schema)
FINAL = sqlContext.createDataFrame(sc.emptyRDD(), schema)
for file in os.listdir('/resources/data/test_variables/'):
df1 = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load("/resources/data/test_variables/"+file)
#SKIP SERIES WITH ONLY 0s
count = df1.groupBy().sum("Bit_value")
if count.select("sum(Bit_value)").collect()[0][0] == 0:
continue
#
i+=1
# AGGREGATION
df1 = df1.withColumn("Interval", ((df1.Timestamp.cast("long") / 1).cast("long") * 1).cast("timestamp"))
# COUNT 1s
df1_ocurrences = df1.groupBy("Interval").sum("Bit_value").sort("Interval")
df1_ocurrences = df1_ocurrences.withColumnRenamed("sum(Bit_value)", "Sum_df1")
# COUNT TOTAL
df1_count = df1.groupBy("Interval").count().sort("Interval")
df1_count = df1_count.withColumnRenamed("count", "Total_df1")
# MERGING
df1_merged = df1_ocurrences.join(df1_count, ["Interval"]).sort("Interval")
var_name = file.split(".")
df1_complete = df1_merged.withColumn(var_name[0], df1_merged.Sum_df1 / df1_merged.Total_df1)
df1_complete = df1_complete.drop('Sum_df1')
df1_complete = df1_complete.drop('Total_df1')
#FINAL DATAFRAME
if i == 1:
FINAL = df1_complete
else:
FINAL = FINAL.join(df1_complete, ["Interval"]).sort("Interval")
Any advice on this? Maybe I am not writing the most efficient code but I am new to Spark.

Too much time spent on GC and too little memory is freed up: https://developer.ibm.com/hadoop/2016/02/16/beginners-guide-apache-spark-troubleshooting/
In addition to recomendation in above article what worked for me in jyputer is this:
spark = SparkSession.builder \
.appName("GBT Model") \
.config("spark.executor.memory", "2000mb") \
.master("local[*]") \
.config("spark.executor.cores", "4") \
.config("spark.yarn.executor.memoryOverhead",200) \
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
.config("spark.default.parallelism", "4") \
.getOrCreate()
Note spark.yarn.executor.memoryOverhead is set to 10% of executor memory.

Related

TypeError: sequence item 0: expected str instance, float found in Pyhton

Below is the Data set I was using(syn-retweet-done.csv in my code). And The above error did come off
,Unnamed: 0,created_at,tweet,category
0,0,2021-07-29 02:40:00,People Gather in numbers,Other
1,0,2021-07-29 02:40:00,No real sign of safety,Other
2,1,2021-07-27 10:40:00,President is On fire,Politics
3,1,2021-07-27 10:40:00,Election is to be held next month,Politics
Below is the codebase I worked on. It would be very helfil if someone can figure out the issue which is pointing aggregated()
def aggregated():
tweets = pd.read_csv(r'syn-retweet-done.csv')
df = pd.DataFrame(tweets, columns=['created_at', 'tweet','category'])
out = df.groupby(['created_at', 'category'], sort=False, as_index=False)['tweet'] \
.apply(lambda x: ' '.join(x))[df.columns]
# print(out)
return out
def saveFile():
df = pd.read_csv('test_1.csv');
categories = df['category'].unique()
for category in categories:
df[df['category'] == category].to_csv(category + '.csv')
# Driver code
if __name__ == '__main__':
print(aggregated())
aggregated().to_csv(r'test_1.csv',index = True, header=True)
saveFile()

Deleting empty columns with a few columns between data

I'm fetching data from a Google sheet:
values1 = pd.DataFrame(values)
aux = values1.head(1)
values1.drop(index={0}, inplace=True)
senal1 = (values1[2] == "SEÑAL")
senal = values1[senal1]
senal.dropna(axis=1, inplace=True)
print(senal)
This is my result after running the code:

How do I speed up the for loop when using pandas?

Hastily, I once wrote the code for work. It was assumed that it would only need to be run once. But now you have to run it often. The source data unfortunately can not provide because they conferenceline. I am interested in the question, which parts of the code need to be corrected in order to get performance? The current version runs for about 2 hours, running about 25K iterations.
dfs = []
for x in tqdm_notebook(range(len(group_furn))):
good_id = group_furn.iloc[x, 0]
name_promo = group_furn.iloc[x, 1]
unique_stores = set(stores.loc[stores.store_type_id != 4]['store_id'].unique()) - \
set(promo_list.loc[(promo_list['good_id'] == good_id) & \
(promo_list['name_promo_mech'] == name_promo)].store_id.unique())
bu_stores = list(unique_stores.intersection(set(bu_furn.store_id.unique())))
main_stores = bu_store.loc[(bu_store.store_id.isin(bu_stores)) & (bu_store.store_type_id != 4)].main_store_id.unique()
df = promo_list.loc[(promo_list.good_id == good_id) & (promo_list.name_promo_mech == name_promo) &
(promo_list.store_id.isin(main_stores))]
bu = bu_store.loc[bu_store.main_store_id.isin(main_stores)]
df = pd.merge(df, bu, how='inner', left_on='store_id', right_on='main_store_id')
dfs.append(df)
main_stores = bu_store.loc[(bu_store.store_id.isin(bu_stores)) & (bu_store.store_type_id == 4)].main_store_id.unique()
owners = bu_store.loc[bu_store.main_store_id.isin(main_stores)].main_owner_id.unique()
df_2 = promo_list.loc[(promo_list.good_id == good_id) & (promo_list.name_promo_mech == name_promo) &
(promo_list.owner_id.isin(owners))]
bu = bu_store.loc[bu_store.main_store_id.isin(main_stores)]
df_2 = pd.merge(df_2, bu, how='inner', left_on='store_id', right_on='main_store_id')
dfs.append(df_2)

How to process data of small chunks in pandas?

I am trying to process the data after using chunksize parameter. I am getting an error as it the iterator is not converting into a dataframe.
pdvs = dbu.readFromSQL("SELECT GOOGLE.* \
,creation_time \
FROM [dbo].[DM_SOURCE_GOOGLE_DETAILS] AS GOOGLE\
JOIN \
( \
SELECT place_id \
,max(datetime) AS datetime \
, min(datetime) AS creation_time \
FROM [dbo].[DM_SOURCE_GOOGLE_DETAILS] \
GROUP BY place_id \
)AS date_updated \
ON GOOGLE.datetime = date_updated.datetime \
AND GOOGLE.place_id = date_updated.place_id", chunksize = 5
)
combined = pd.DataFrame(pdvs)
# get address
combined['valid'] = True
But the transformation is not working. I am getting the following error
'generator' object does not support item assignment

Reducing RAM consumption of Python dict

I have a python script that process several files of some gigabytes. With the following code I show below, I store some data into a list, which is stored into a dictionary snp_dict. The RAM consumption is huge. Looking at my code, could you suggest some ways to reduce RAM consumption, if any?
def extractAF(files_vcf):
z=0
snp_dict=dict()
for infile_name in sorted(files_vcf):
print ' * ' + infile_name
###single files
vcf_reader = vcf.Reader(open(infile_name, 'r'))
for record in vcf_reader:
snp_position='_'.join([record.CHROM, str(record.POS)])
ref_F = float(record.INFO['DP4'][0])
ref_R = float(record.INFO['DP4'][1])
alt_F = float(record.INFO['DP4'][2])
alt_R = float(record.INFO['DP4'][3])
AF = (alt_F+alt_R)/(alt_F+alt_R+ref_F+ref_R)
if not snp_position in snp_dict:
snp_dict[snp_position]=list((0) for _ in range(len(files_vcf)))
snp_dict[snp_position][z] = round(AF, 3) #record.INFO['DP4']
z+=1
return snp_dict
I finally adopted the following implementation with MySQL:
for infile_name in sorted(files_vcf):
print infile_name
###single files
vcf_reader = vcf.Reader(open(infile_name, 'r'))
for record in vcf_reader:
snp_position='_'.join([record.CHROM, str(record.POS)])
ref_F = float(record.INFO['DP4'][0])
ref_R = float(record.INFO['DP4'][1])
alt_F = float(record.INFO['DP4'][2])
alt_R = float(record.INFO['DP4'][3])
AF = (alt_F+alt_R)/(alt_F+alt_R+ref_F+ref_R)
if not snp_position in snp_dict:
sql_insert_table = "INSERT INTO snps VALUES ('" + snp_position + "'," + ",".join(list(('0') for _ in range(len(files_vcf)))) + ")"
cursor = db1.cursor()
cursor.execute(sql_insert_table)
db1.commit()
snp_dict.append(snp_position)
sql_update = "UPDATE snps SET " + str(z) + "g=" + str(AF) + " WHERE snp_pos='" + snp_position + "'";
cursor = db1.cursor()
cursor.execute(sql_update)
db1.commit()
z+=1
return snp_dict
For this sort of thing, you are probably better off using another data structure. A pandas DataFrame would work well in your situation.
The simplest solution would be to use an existing library, rather than writing your own parser. vcfnp can read vcf files into a format that is easily convertible to a pandas DataFrame. Something like this should work:
import pandas as pd
def extractAF(files_vcf):
dfs = []
for fname in sorted(files_vcf):
vars = vcfnp.variants(fname, fields=['CHROM', 'POS', 'DP4'])
snp_pos = np.char.add(np.char.add(vars.CHROM, '_'), record.POS.astype('S'))
dp4 = vars.DP4.astype('float')
AF = dp4[2:].sum(axis=0)/dp4.sum(axis=0)
dfs.append(pd.DataFrame(AF, index=snp_pos, columns=[fname]).T)
return pd.concat(dfs).fillna(0.0)
If you absolutely must use PyVCF, it will be slower, but hopefully this will at least be faster than your existing implementation, and should produce the same result as the above code:
def extractAF(files_vcf):
files_vcf = sorted(files_vcf)
dfs = []
for fname in files_vcf:
print ' * ' + fname
vcf_reader = vcf.Reader(open(fname, 'r'))
vars = ((rec.CHROM, rec.POS) + tuple(rec.INFO['DP4']) for rec in vcf_reader)
df = pd.DataFrame(vars, columns=['CHROMS', 'POS', 'ref_F', 'ref_R', 'alt_F', 'alt_R'])
df['snp_position'] = df['CHROMS'] + '_' + df['POS'].astype('S')
df_alt = df.loc[:, ('alt_F', 'alt_R')]
df_dp4 = df.loc[:, ('alt_F', 'alt_R', 'ref_F', 'ref_R')]
df[fname] = df_alt.sum(axis=1)/df_dp4.sum(axis=1)
df = df.set_index('snp_position', drop=True).loc[:, fname:fname].T
dfs.append(df)
return pd.concat(dfs).fillna(0.0)
Now lets say you wanted to read a particular snp_position, say contained in a variable snp_pos, that may or may not be there (from your comment), you wouldn't actually have to change anything:
all_vcf = extractAF(files_vcf)
if snp_pos in all_vcf:
linea_di_AF = all_vcf[snp_pos]
The result will be slightly different, though. It will be a pandas Series, which is like an array but can also be accessed like a dictionary:
all_vcf = extractAF(files_vcf)
if snp_pos in all_vcf:
linea_di_AF = all_vcf[snp_pos]
f_di_AF = linea_di_AF[files_vcf[0]]
This allows you to access a particular file/snp_pos pair directly:
all_vcf = extractAF(files_vcf)
if snp_pos in all_vcf:
f_di_AF = linea_di_AF[snp_pos][files_vcf[0]]
Or, better yet:
all_vcf = extractAF(files_vcf)
if snp_pos in all_vcf:
f_di_AF = linea_di_AF.loc[files_vcf[0], snp_pos]
Or you can get all snp_pos values for a given file:
all_vcf = extractAF(files_vcf)
fpos = linea_di_AF.loc[fname]

Categories

Resources