I am trying to anonymize data in CSV, however, I only want to do this for cells that are not empty. At present, my program adds anonymized data to all cells with the given row.
How can I skip empty the empty cells? Below is my program
import csv
from faker import Faker
from collections import defaultdict
def anonymize():
"Anonymizes the given original data to anonymized form"
faker = Faker()
names = defaultdict(faker.name)
emails = defaultdict(faker.email)
with open(filename, "r") as f:
with open(f"{filename}-anonymized_data.csv", "w") as o:
reader = csv.DictReader(f)
writer = csv.DictWriter(o, reader.fieldnames)
writer.writeheader()
for row in reader:
row["adult_First_Name"] = names[
row["adult_First_Name"]
]
row["child_First_Name"] = names[
row["child_First_Name"]
]
row["Adult - EMAIL ADDRESS"] = emails[row["Adult - EMAIL ADDRESS"]]
row["Parent - EMAIL ADDRESS"] = emails[row["Parent - EMAIL ADDRESS"]]
writer.writerow(row)
if __name__ == "__main__":
anonymize()
You could test each field before applying the fake value. A simpler approach would be to store the fields that need to be changed in a fields list along with which faker function to apply if needed:
import csv
from faker import Faker
def anonymize():
"Anonymizes the given original data to anonymized form"
faker = Faker()
fields = [
("adult_First_Name", faker.name),
("child_First_Name", faker.name),
("Adult - EMAIL ADDRESS", faker.email),
("Parent - EMAIL ADDRESS", faker.email),
]
with open(filename, "r") as f:
with open(f"{filename}-anonymized_data.csv", "w", newline="") as o:
reader = csv.DictReader(f)
writer = csv.DictWriter(o, reader.fieldnames)
writer.writeheader()
for row in reader:
for field, fake in fields:
if row[field]:
row[field] = fake()
writer.writerow(row)
if __name__ == "__main__":
anonymize()
Adding newline='' would stop extra blank lines in the output.
Related
I am programming a discord bot that lets users send an embed to a channel. The embed is split into multiple parts, which I want to safe to a CSV file because I want to add features that require the data to be saved.
The problem is, that when a user executes the command, the first line in the CSV gets overridden with the new content of the command/embed.
I tried to use a different mode to save to the next line. 'w' for writing is the one I am experiencing the problem with. The mode 'a' almost works, but it also adds the field names every time.
The CSV Code:
with open('homework.csv', 'a', newline='') as file:
fieldnames = ['user', 'fach', 'aufgabe', 'abgabedatum']
writer = csv.DictWriter(file, fieldnames=fieldnames)
writer.writeheader()
writer.writerow({'user': str(message.author), 'fach': subject, 'aufgabe': task, 'abgabedatum': date})
The CSV Output using the mode a
user,fach,aufgabe,abgabedatum
user,fach,aufgabe,abgabedatum
Akorian#0187,test,ja mddoiddn ,01.03.2021
user,fach,aufgabe,abgabedatum
Akorian#0187,testddd,ja mddoiddn ,01.03.2021
Try this:
import csv
import os
if os.path.isfile('homework.csv'):
with open('homework.csv', 'a', newline='') as file:
fieldnames = ['user', 'fach', 'aufgabe', 'abgabedatum']
w = csv.DictWriter(file, fieldnames=fieldnames)
w.writerow({'user': str(message.author), 'fach': subject, 'aufgabe': task, 'abgabedatum': date})
else:
with open('homework.csv', 'w', newline='') as file:
fieldnames = ['user', 'fach', 'aufgabe', 'abgabedatum']
w = csv.DictWriter(file, fieldnames=fieldnames)
w.writeheader()
w.writerow({'user': str(message.author), 'fach': subject, 'aufgabe': task, 'abgabedatum': date})
I want to get all the data of a lead object
and I did this script to obtain the data of a lead
and i only get an amount of 2000.
What do I have to do to get all the amount in
Salesforce of an object by pyhon?
from simple_salesforce import Salesforce
from datetime import datetime
import csv
import os
import json
import account
SALESFORCE_USERNAME = '123'
PASSWORD = '123'
SECURITY_TOKEN = '123'
def main():
# Authentication settings
sf = Salesforce(username=SALESFORCE_USERNAME,
password=PASSWORD,
security_token=SECURITY_TOKEN)
# Lead Column setting to be acquired
columns = [
"Id",
"Email",
"Company",
"Address",
"Phone",
"FirstName",
"LastName",
"CreatedDate",
]
sosl = 'SELECT {0[0]}, {0[1]}, {0[2]}, {0[3]}, {0[4]}, {0[5]}, {0[6]} , {0[7]} , {0[8]} FROM Lead '.format(
columns)
# Data acquisition with SOSL
data = sf.query(sosl)
# Delete CSV file if it exists
output_csv = 'output.csv'
if os.path.exists(output_csv):
os.remove(output_csv)
# Write to CSV file
for k, v in data.items():
if type(v) is list:
with open(output_csv, 'w', newline="") as f:
writer = csv.DictWriter(f, fieldnames=columns)
writer.writeheader()
for d in v:
data = json.loads(json.dumps(d))
del data['attributes']
d = datetime.strptime(
data['CreatedDate'], '%Y-%m-%dT%H:%M:%S.%f%z')
data['CreatedDate'] = d.strftime('%Y-%m-%d %H:%M:%S')
writer.writerow(data)
if __name__ == '__main__':
main()
If anyone knows, please let me know.
You can obtain all responsive records via the query_all() or query_all_iter() methods, which are documented under Queries in the simple_salesforce documentation.
Note that the query you are running is SOQL, not SOSL. SOSL is for full-text search.
I have created a function that fetches price, rating, etc after it hits an API:
def is_priced(business_id):
try:
priced_ind = get_business(API_KEY, business_id)
priced_ind1 = priced_ind['price']
except:
priced_ind1 = 'None'
return priced_ind1
priced_ind = is_priced(b_id)
print(priced_ind)
Similar for rating
def is_rated(business_id):
try:
rated_ind = get_business(API_KEY, business_id)
rated_ind1 = rated_ind['rating']
except:
rated_ind1 = 'None'
return rated_ind1
However, I want my function to loop through the business names I have in my CSV file and catch all this data and export it to a new csv file with these two parameters beside the names of the business.
The CSV file has info on the name of the business along with its address,city,state,zip and country
Eg:
Name address city state zip country
XYZ(The) 5* WE 223899th St. New York NY 19921 US
My output:
Querying https://api.xyz.com/v3/businesses/matches ...
True
Querying https://api.xyz.com/v3/businesses/matches ...
4.0
Querying https://api.xyz.com/v3/businesses/matches ...
$$
Querying https://api.xyz.com/v3/businesses/matches ...
Querying https://api.xyz.com/v3/businesses/matches ...
The real issue is my output only returns business id in the csv. and the rating etc as u see is just returned in the console. how do I set a loop such that it returns for all the businesses the info i desire into a single CSV?
The csv module is useful for this sort of thing e.g.
import csv
with open('f.csv', 'r') as csvfile:
reader = csv.reader(csvfile, delimiter=',', quotechar='"')
with open('tmp.csv', 'w') as output:
writer = csv.writer(output)
for row in reader:
business_id = row[0]
row.append(get_price_index(business_id))
row.append(get_rate_index(business_id))
writer.writerow(row)
You can read the business names from the CSV file, iterate over them using a for loop, hit the API and store the results, and write to a new CSV file.
import csv
data = []
with open('businesses.csv') as fp:
# skip header line
header = next(fp)
reader = csv.reader(fp)
for row in reader:
b_name = reader[0]
# not sure how you get the business ID:
b_id = get_business_id(b_name)
p = is_priced(b_id)
r = is_rated(b_id)
out.append((b_name, p, r))
# write out the results
with open('business_data.csv', 'w') as fp:
writer = csv.writer(fp)
writer.writerow(['name', 'price', 'rating'])
for row in data:
writer.writerow(row)
You can do this easily using pandas:
import pandas as pd
csv = pd.read_csv('your_csv.csv', usecols=['business_name']) # since you only need the name
# you'll receive business_name in your functions
csv = csv.apply(is_priced, axis=1)
csv = csv.apply(is_rated, axis=1)
csv.to_csv('result.csv', index=False)
All you have to do in your functions is:
def is_priced(row):
business_name = row['business_name']
business_id = ??
...
I have some CSV files the format is ID, timestamp, customerID, email, etc. I want fill the Email column to empty and other columns keep it same. I'm using Python 2.7 and is restricted to use Pandas. Can anyone help me?
Thank you all for the help
My code below, but this is not that efficiency and reliable also if some raw have the strange character it will be broken the logic.
new_columns = [
'\xef\xbb\xbfID', 'timestamp', 'CustomerID', 'Email', 'CountryCode', 'LifeCycle', 'Package', 'Paystatus', 'NoUsageEver', 'NoUsage', 'VeryLowUsage',
'LowUsage', 'NormalUsage', 'HighUsage', 'VeryHighUsage', 'LastStartDate', 'NPS 0-8', 'NPS Score (Q2)', 'Gender(Q38)', 'DOB(Q39)',
'Viaplay users(Q3)', 'Primary Content (Q42)', 'Primary platform(Q4)', 'Detractor (strong) (Q5)', 'Detractor open text(Q22)',
'Contact Detractor (Q21)', 'Contact Detractor (Q20)', 'Contact Detractor (Q43)', 'Contact Detractor(Q26)', 'Contact Detractor(Q27)',
'Contact Detractor(Q44)', 'Improvement areas(Q7)', 'Improvement areas (Q40)', 'D2 More value for money(Q45)', 'D2 Sport content(Q8)',
'D2 Series content(Q9)', 'D2 Film content(Q10)', 'D2 Children content(Q11)', 'D2 Easy to start and use(Q12)',
'D2 Technical and quality(Q13)',
'D2 Platforms(Q14)', 'D2 Service and support(Q15)', 'D3 Sport content(Q16)', 'Missing Sport Content (Q41)',
'D3 Series and films content(Q17)',
'NPS 9-10', 'Recommendation drivers(Q28)', 'R2 Sport content(Q29)', 'R2 Series content(Q30)', 'R2 Film content(Q31)',
'R2 Children content(Q32)', 'R2 Easy to start and use(Q33)', 'R2 Technical and quality(Q34)', 'R2 Platforms(Q35)',
'R2 Service and support(Q36)',
'Promoter open text(Q37)'
]
with open(file_path, 'r') as infile:
print file_path
reader = csv.reader(infile, delimiter=";")
first_row = next(reader)
for row in reader:
output_row = []
for column_name in new_columns:
ind = first_row.index(column_name)
data = row[ind]
if ind == first_row.index('Email'):
data = ''
output_row.append(data)
writer.writerow(output_row)
File format before
File format after
So you are reordering the columns and clearing the email column:
with open(file_path, 'r') as infile:
print file_path
reader = csv.reader(infile, delimiter=";")
first_row = next(reader)
for row in reader:
output_row = []
for column_name in new_columns:
ind = first_row.index(column_name)
data = row[ind]
if ind == first_row.index('Email'):
data = ''
output_row.append(data)
writer.writerow(output_row)
I would suggest moving the searches first_row.index(column_name) and first_row.index('Email') out of the per row processing.
with open(file_path, 'r') as infile:
print file_path
reader = csv.reader(infile, delimiter=";")
first_row = next(reader)
email = first_row.index('Email')
indexes = []
for column_name in new_columns:
ind = first_row.index(column_name)
indexes.append(ind)
for row in reader:
output_row = []
for ind in indexes:
data = row[ind]
if ind == email:
data = ''
output_row.append(data)
writer.writerow(output_row)
email is the index of the email column in the input. indexes is a list of the indexes of the columns in the input in the order specified by the new_columns.
Untested.
You could use dict versions of the csv reader/writer to get the column by name. Something like this:
import csv
with open('./test.csv', 'r') as infile:
reader = csv.DictReader(infile, delimiter=";")
with open('./output.csv', 'w') as outfile:
writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames)
writer.writeheader()
for row in reader:
row['Email'] = ''
writer.writerow(row)
So I am collecting data and this data is saved into csv files, however for presentation purposes I want to reorder the columns in each respective csv file based on it's related "order".
I was using this question (write CSV columns out in a different order in Python) as a guide but I'm not sure why I'm getting the error
writeindices = [name2index[name] for name in writenames]
KeyError: % Processor Time
when I run it. Note this error doesn't seem to be limited to just the string % Processor Time'.
Where am I going wrong?
Here is my code:
CPU_order=["%"+" Processor Time", "%"+" User Time", "Other"]
Memory_order=["Available Bytes", "Pages/sec", "Pages Output/sec", "Pages Input/sec", "Page Faults/sec"]
def reorder_csv(path,title,input_file):
if title == 'CPU':
order=CPU_order
elif title == 'Memory':
order=Memory_order
output_file=path+'/'+title+'_reorder'+'.csv'
writenames = order
reader = csv.reader(input_file)
writer = csv.writer(open(output_file, 'wb'))
readnames = reader.next()
name2index = dict((name, index) for index, name in enumerate(readnames))
writeindices = [name2index[name] for name in writenames]
reorderfunc = operator.itemgetter(*writeindices)
writer.writerow(writenames)
for row in reader:
writer.writerow(reorderfunc(row))
Here is a sample of what the input CSV file looks like:
,CPU\% User Time,CPU\% Processor Time,CPU\Other
05/23/2016 06:01:51.552,0,0,0
05/23/2016 06:02:01.567,0.038940741537158409,0.62259056657940626,0.077882481554869071
05/23/2016 06:02:11.566,0.03900149141703179,0.77956981074955856,0
05/23/2016 06:02:21.566,0,0,0
05/23/2016 06:02:31.566,0,1.1695867249963632,0
Your code works. It is your data which does not have a column named "% Processor Time". Here is a sample data I use:
Other,% User Time,% Processor Time
o1,u1,p1
o2,u2,p2
And here is the code which I call:
reorder_csv('.', 'CPU', open('data.csv'))
With these settings, everything works fine. Please check your data.
Update
Now that I see your data, it looks like your have column names such as "CPU\% Processor Time" and want to translate it to "% Processor Time" before writing out. All you need to do is creating your name2index this way:
name2index = dict((name.replace('CPU\\', ''), index) for index, name in enumerate(readnames))
The difference here is instead of name, you should have name.replace('CPU\\', ''), which get rid of the CPU\ part.
Update 2
I reworked your code to use csv.DictReader and csv.DictWriter. I also assume that "CPU\% Prvileged Time" will be transformed into "Other". If that is not the case, you can fix it in the transformer dictionary.
import csv
import os
def rename_columns(row):
""" Take a row (dictionary) of data and return a new row with columns renamed """
transformer = {
'CPU\\% User Time': '% User Time',
'CPU\\% Processor Time': '% Processor Time',
'CPU\\% Privileged Time': 'Other',
}
new_row = {transformer.get(k, k): v for k, v in row.items()}
return new_row
def reorder_csv(path, title, input_file):
header = dict(
CPU=["% Processor Time", "% User Time", "Other"],
Memory=["Available Bytes", "Pages/sec", "Pages Output/sec", "Pages Input/sec", "Page Faults/sec"],
)
reader = csv.DictReader(input_file)
output_filename = os.path.join(path, '{}_reorder2.csv'.format(title))
with open(output_filename, 'wb') as outfile:
# Create a new writer where each row is a dictionary.
# If the row contains extra keys, ignore them
writer = csv.DictWriter(outfile, header[title], extrasaction='ignore')
writer.writeheader()
for row in reader:
# Each row is a dictionary, not list
print row
row = rename_columns(row)
print row
print
writer.writerow(row)