Editing a downloaded CSV in memory before writing - python

Forewarning: I am very new to Python and programming in general. I am trying to use Python 3 to get some CSV data and making some changes to it before writing it to a file. My problem lies in accessing the CSV data from a variable, like so:
import csv
import requests
csvfile = session.get(url)
reader = csv.reader(csvfile.content)
for row in reader:
do(something)
This returns:
_csv.Error: iterator should return strings, not int (did you open the file in text mode?)
Googling revealed that I should be feeding the reader text instead of bytes, so I also attempted:
reader = csv.reader(csvfile.text)
This also does not work as the loop works through it letter by letter instead of line by line. I also experimented with TextIOWrapper and similar options with no success. The only way I have managed to get this to work is by writing the data to a file, reading it, and then making changes, like so:
csvfile = session.get(url)
with open("temp.txt", 'wb') as f:
f.write(csvfile.content)
with open("temp.txt", 'rU', encoding="utf8") as data:
reader = csv.reader(data)
for row in reader:
do(something)
I feel like this is far from the most optimal way of doing this, even if it works. What is the proper way to read and edit the CSV data directly from memory, without having to save it to a temporary file?

you don't have to write to a temp file, here is what I would do, using the "csv" and "requests" modules:
import csv
import requests
__csvfilepathname__ = r'c:\test\test.csv'
__url__ = 'https://server.domain.com/test.csv'
def csv_reader(filename, enc = 'utf_8'):
with open(filename, 'r', encoding = enc) as openfileobject:
reader = csv.reader(openfileobject)
for row in reader:
#do something
print(row)
return
def csv_from_url(url):
line = ''
datalist = []
s = requests.Session()
r = s.get(url)
for x in r.text.replace('\r',''):
if not x[0] == '\n':
line = line + str(x[0])
else:
datalist.append(line)
line = ''
datalist.append(line)
# at this point you already have a data list 'datalist'
# no need really to use the csv.reader object, but here goes:
reader = csv.reader(datalist)
for row in reader:
#do something
print(row)
return
def main():
csv_reader(__csvfilepathname__)
csv_from_url(__url__)
return
if __name__ == '__main__':
main ()
not very pretty, and probably not very good in regards to memory/performance, depending on how "big" your csv/data is
HTH, Edwin.

Related

Looping through two sets of data to conditionally append to CSV file

I'm writing a script that should: 1) open a CSV file 2) loop through some eBay results data 3) write details from this data to the same file if it matches a search term and if it's not already present, but it has a few issues:
The headers are not written if not present, so I'm manually adding them. I was previously using DictWriter() with writeheader(), but fieldnames was required, so they were written each time
The if result_id != existing_result_id: condition doesn't work in a nested loop, so the rows aren't written. if any(x in result_title.upper() for x in search_terms): does currently work, but I'm not sure how to combine them before writing the rows
import csv
import smtplib
from requests_html import HTMLSession
SEARCH_TERMS = ['one', 'two']
session = HTMLSession()
response = session.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=blink+182&_sacat=0&_sop=10&rt=nc&LH_PrefLoc=2')
results = response.html.find('ul.srp-results li.s-item')
with open('ebay-results.csv', 'r') as csv_file:
csv_reader = csv.reader(csv_file)
next(csv_reader) # Skip headers
with open('ebay-results.csv', 'a') as csv_file_updated:
# field_names = ['Title', 'Link', 'ID']
csv_writer = csv.writer(csv_file_updated)
search_terms = [x.upper() for x in SEARCH_TERMS]
for result in results:
result_title = result.find('h3.s-item__title', first=True).text.replace('New Listing', '')
result_link = result.find('a.s-item__link', first=True).attrs['href'].split('?')[0]
result_id = result_link.split('/')[4].split('?')[0]
result_info = [result_title, result_link, result_id]
if any(x in result_title.upper() for x in search_terms):
for line in csv_reader:
existing_result_id = line[2]
if result_id != existing_result_id:
csv_writer.writerow(result_info)
send_email(search_terms, result_link)
There's a few issues with your code:
you create a file handle to read a file and then another file handle to append to the same file; that's dodgy at best, do you expect the reader to read lines you've appended? What's the purpose?
you exhaust the reader with for line in csv_reader: for every result, which will only work once, since you don't seek the start of the file before rereading; however, why reread the file over and over anyway?
standard Python indentation is 4 deep, you would do well to follow convention, as not doing so just makes your code harder to read, maintain and reuse.
It appears you simply want to write results for identifiers you haven't written results for previously to the file. Why not do this:
read the results once, keeping the identifiers for a quick lookup
then loop over the results, writing results with new identifiers
This is likely what you were after:
import csv
from requests_html import HTMLSession
SEARCH_TERMS = ['one', 'two']
session = HTMLSession()
response = session.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=blink+182&_sacat=0&_sop=10&rt=nc&LH_PrefLoc=2')
results = response.html.find('ul.srp-results li.s-item')
with open('ebay-results.csv', 'r+', newline='') as csv_file:
csv_reader = csv.reader(csv_file)
next(csv_reader) # Skip headers
existing_ids = [rec[2] for rec in csv_reader]
# field_names = ['Title', 'Link', 'ID']
csv_writer = csv.writer(csv_file)
search_terms = [x.upper() for x in SEARCH_TERMS]
for result in results:
result_title = result.find('h3.s-item__title', first=True).text.replace('New Listing', '')
result_link = result.find('a.s-item__link', first=True).attrs['href'].split('?')[0]
result_id = result_link.split('/')[4].split('?')[0]
result_info = [result_title, result_link, result_id]
if any(x in result_title.upper() for x in search_terms):
if result_id not in existing_ids:
csv_writer.writerow(result_info)
# not implemented, but not relevant to the question
# send_email(search_terms, result_link)
You asked how to deal with the headers, another issue might be that the .csv doesn't exist when first running. Something like this would solve both:
from pathlib import Path
# create file if it doesn't exist, write header to it
if not Path('ebay-results.csv').is_file():
with open('ebay-results.csv', 'w') as csv_file:
csv_file.write('Title,Link,ID\n')
# reopen the file for reading, which now must exist, and has a header
with open('ebay-results.csv', 'r+', newline='') as csv_file:
csv_reader = csv.reader(csv_file)
field_names = next(csv_reader)

Combine two python scripts for web search

I'm trying to download files from a site and due to search result limitations (max 300), I need to search each item individually. I have a csv file that has a complete list which I've written some basic code to return the ID# column.
With some help, I've got another script that iterates through each search result and downloads a file. What I need to do now is to combine the two so that it will search each individual ID# and download the file.
I know my loop is messed up here, I just can't figure out where and if I'm even looping in the right order
import requests, json, csv
faciltiyList = []
with open('Facility List.csv', 'r') as f:
csv_reader = csv.reader(f, delimiter=',')
for searchterm in csv_reader:
faciltiyList.append(searchterm[0])
url = "https://siera.oshpd.ca.gov/FindFacility.aspx"
r = requests.get(url+"?term="+str(searchterm))
searchresults = json.loads(r.content.decode('utf-8'))
for report in searchresults:
rpt_id = report['RPT_ID']
reporturl = f"https://siera.oshpd.ca.gov/DownloadPublicFile.aspx?archrptsegid={rpt_id}&reporttype=58&exportformatid=8&versionid=1&pageid=1"
r = requests.get(reporturl)
a = r.headers['Content-Disposition']
filename = a[a.find("filename=")+9:len(a)]
file = open(filename, "wb")
file.write(r.content)
r.close()
The original code I have is here:
import requests, json
searchterm="ALAMEDA (COUNTY)"
url="https://siera.oshpd.ca.gov/FindFacility.aspx"
r=requests.get(url+"?term="+searchterm)
searchresults=json.loads(r.content.decode('utf-8'))
for report in searchresults:
rpt_id=report['RPT_ID']
reporturl=f"https://siera.oshpd.ca.gov/DownloadPublicFile.aspx?archrptsegid={rpt_id}&reporttype=58&exportformatid=8&versionid=1&pageid=1"
r=requests.get(reporturl)
a=r.headers['Content-Disposition']
filename=a[a.find("filename=")+9:len(a)]
file = open(filename, "wb")
file.write(r.content)
r.close()
The searchterm ="ALAMEDA (COUNTY)" results in more than 300 results, so I'm trying to replace "ALAMEDA (COUNTY)" with a list that'll run through each name (ID# in this case) so that I'll get just one result, then run again for the next on the list
CSV - just 1 line
Tested with a CSV file with just 1 line:
406014324,"HOLISTIC PALLIATIVE CARE, INC.",550004188,Parent Facility,5707 REDWOOD RD,OAKLAND,94619,1,ALAMEDA,Not Applicable,,Open,1/1/2018,Home Health Agency/Hospice,Hospice,37.79996,-122.17075
Python code
This script reads the IDs from the CSV file. Then, it fetches the results from URL and finally writes the desired contents to the disk.
import requests, json, csv
# read Ids from csv
facilityIds = []
with open('Facility List.csv', 'r') as f:
csv_reader = csv.reader(f, delimiter=',')
for searchterm in csv_reader:
facilityIds.append(searchterm[0])
# fetch and write file contents
url = "https://siera.oshpd.ca.gov/FindFacility.aspx"
for facilityId in facilityIds:
r = requests.get(url+"?term="+str(facilityId))
reports = json.loads(r.content.decode('utf-8'))
# print(f"reports = {reports}")
for report in reports:
rpt_id = report['RPT_ID']
reporturl = f"https://siera.oshpd.ca.gov/DownloadPublicFile.aspx?archrptsegid={rpt_id}&reporttype=58&exportformatid=8&versionid=1&pageid=1"
r = requests.get(reporturl)
a = r.headers['Content-Disposition']
filename = a[a.find("filename=")+9:len(a)]
# print(f"filename = {filename}")
with open(filename, "wb") as o:
o.write(r.content)
Repl.it link

How to update a CSV file using its package

Here's my code:
def update_win():
#GET EVERY STRING VALUES NEEDED FROM INPUTBAR
stud_ID = str(ID_num.get())
stud_name = str(name.get())
stud_course = str(Crs.get())
stud_year = str(Yr.get())
searchID = str(sID_num.get())#ID NUMBER FOR SEARCH
filename = str(files.get())#FILENAME
tempfile = NamedTemporaryFile(mode='w', delete=False)
fields = ['ID', 'Name', 'Course', 'Year']
with open(filename, 'r') as csvfile, tempfile:
reader = csv.DictReader(csvfile, fieldnames=fields)
writer = csv.DictWriter(tempfile, fieldnames=fields)
for row in reader:
if row['ID'] == searchID:
row['Name'], row['Course'], row['Year'] = stud_name, stud_course, stud_year
msg = Label(upd_win, text="Update Successful", font="fixedsys 12 bold").place(x=3,y=200)
row = {'ID': row['ID'], 'Name': row['Name'], 'Course': row['Course'], 'Year': row['Year']}
writer.writerow(row)
shutil.move(tempfile.name, filename)
So this code is an UPDATE, it searches that ID number from the CSV, and shows its rows via a GUI as you can see its not print but Label, after that it prompts the user to enter the new ID number, new name, new course, and new year, you want to replace to the row you have selected.
It does get through, but the value doesn't change. Any ideas what happened here and how I fix it?
Sometimes you need to isolate your problem by writing separate scripts to test your processes. This exercise is the first step to creating a Minimal, Complete, and Verifiable example for posting here. Oftentimes, creating an mcve helps you find the problem even before you post the question.
Here is a script to test whether the read-modify-write process works:
from tempfile import NamedTemporaryFile
import io
s = '''one, two, three
1,2,3
4,5,6
7,8,9
10,11,12
'''
data_csv = io.StringIO(s)
g = NamedTemporaryFile(mode = 'w', delete = False)
fields = ['a', 'b', 'c']
# read-modify-write
with data_csv as f, g:
w = csv.DictWriter(g, fields)
w.writeheader()
r = csv.DictReader(f, fields)
for line in r:
line['b'] = 'bar'
w.writerow(line)
# test - did the modifications get written to a temp file?
with open(g.name) as f:
print(f.read())
Which does seem to be working, the tempfile has modified data in it.
Maybe HOW you modified the data is the problem - but changing the test script to match the form of your code also works fine
...
for line in r:
line['a'], line['b'], line['c'] = line['a'], 'foo', line['c']
line = {'a':line['a'], 'b':line['b'], 'c':line['c']}
w.writerow(line)
Assuming all the .get()'s in the first lines of the function are working, filename in the line
shutil.move(tempfile.name, filename)
must not have the correct path.
OR the conditional
if row['ID'] == searchID:
isn't working.
Food for thought:
Moving code into functions, like the read-modify-write portion, can not only help with readability, it can make testing easier.
update_win() works by using a side effect (shutil.move(tempfile.name, filename)) instead of returning something that can be acted on. Side effects can make testing harder.
That isn't necessarily bad (sometimes it is practical), you just need to be aware that you are doing it.
You have opened both files for 'r' read and you are trying to write to temp file.
You can use 'r+' mode with one file handle in order to read and write to a file
...
with open(filename, 'r+') as csvfile:
reader = csv.DictReader(csvfile, fieldnames=fields)
writer = csv.DictWriter(csvfile, fieldnames=fields)
...

Append JSON to file

I am trying to append values to a json file. How can i append the data? I have been trying so many ways but none are working ?
Code:
def all(title,author,body,type):
title = "hello"
author = "njas"
body = "vgbhn"
data = {
"id" : id,
"author": author,
"body" : body,
"title" : title,
"type" : type
}
data_json = json.dumps(data)
#data = ast.literal_eval(data)
#print data_json
if(os.path.isfile("offline_post.json")):
with open('offline_post.json','a') as f:
new = json.loads(f)
new.update(a_dict)
json.dump(new,f)
else:
open('offline_post.json', 'a')
with open('offline_post.json','a') as f:
new = json.loads(f)
new.update(a_dict)
json.dump(new,f)
How can I append data to json file when this function is called?
I suspect you left out that you're getting a TypeError in the blocks where you're trying to write the file. Here's where you're trying to write:
with open('offline_post.json','a') as f:
new = json.loads(f)
new.update(a_dict)
json.dump(new,f)
There's a couple of problems here. First, you're passing a file object to the json.loads command, which expects a string. You probably meant to use json.load.
Second, you're opening the file in append mode, which places the pointer at the end of the file. When you run the json.load, you're not going to get anything because it's reading at the end of the file. You would need to seek to 0 before loading (edit: this would fail anyway, as append mode is not readable).
Third, when you json.dump the new data to the file, it's going to append it to the file in addition to the old data. From the structure, it appears you want to replace the contents of the file (as the new data contains the old data already).
You probably want to use r+ mode, seeking back to the start of the file between the read and write, and truncateing at the end just in case the size of the data structure ever shrinks.
with open('offline_post.json', 'r+') as f:
new = json.load(f)
new.update(a_dict)
f.seek(0)
json.dump(new, f)
f.truncate()
Alternatively, you can open the file twice:
with open('offline_post.json', 'r') as f:
new = json.load(f)
new.update(a_dict)
with open('offline_post.json', 'w') as f:
json.dump(new, f)
This is a different approach, I just wanted to append without reloading all the data. Running on a raspberry pi so want to look after memory. The test code -
import os
json_file_exists = 0
filename = "/home/pi/scratch_pad/test.json"
# remove the last run json data
try:
os.remove(filename)
except OSError:
pass
count = 0
boiler = 90
tower = 78
while count<10:
if json_file_exists==0:
# create the json file
with open(filename, mode = 'w') as fw:
json_string = "[\n\t{'boiler':"+str(boiler)+",'tower':"+str(tower)+"}\n]"
fw.write(json_string)
json_file_exists=1
else:
# append to the json file
char = ""
boiler = boiler + .01
tower = tower + .02
while(char<>"}"):
with open(filename, mode = 'rb+') as f:
f.seek(-1,2)
size=f.tell()
char = f.read()
if char == "}":
break
f.truncate(size-1)
with open(filename, mode = 'a') as fw:
json_string = "\n\t,{'boiler':"+str(boiler)+",'tower':"+str(tower)+"}\n]"
fw.seek(-1, os.SEEK_END)
fw.write(json_string)
count = count + 1

extracting data from CSV file using a reference

I have a csv file with several hundred organism IDs and a second csv file with several thousand organism IDs and additional characteristics (taxonomic information, abundances per sample, etc)
I am trying to write a code that will extract the information from the larger csv using the smaller csv file as a reference. Meaning it will look at both smaller and larger files, and if the IDs are in both files, it will extract all the information form the larger file and write that in a new file (basically write the entire row for that ID).
so far I have written the following, and while the code does not error out on me, I get a blank file in the end and I don't exactly know why. I am a graduate student that knows some simple coding but I'm still very much a novice,
thank you
import sys
import csv
import os.path
SparCCnames=open(sys.argv[1],"rU")
OTU_table=open(sys.argv[2],"rU")
new_file=open(sys.argv[3],"w")
Sparcc_OTUs=csv.writer(new_file)
d=csv.DictReader(SparCCnames)
ids=csv.DictReader(OTU_table)
for record in ids:
idstopull=record["OTUid"]
if idstopull[0]=="OTUid":
continue
if idstopull[0] in d:
new_id.writerow[idstopull[0]]
SparCCnames.close()
OTU_table.close()
new_file.close()
I'm not sure what you're trying to do in your code but you can try this:
def csv_to_dict(csv_file_path):
csv_file = open(csv_file_path, 'rb')
csv_file.seek(0)
sniffdialect = csv.Sniffer().sniff(csv_file.read(10000), delimiters='\t,;')
csv_file.seek(0)
dict_reader = csv.DictReader(csv_file, dialect=sniffdialect)
csv_file.seek(0)
dict_data = []
for record in dict_reader:
dict_data.append(record)
csv_file.close()
return dict_data
def dict_to_csv(csv_file_path, dict_data):
csv_file = open(csv_file_path, 'wb')
writer = csv.writer(csv_file, dialect='excel')
headers = dict_data[0].keys()
writer.writerow(headers)
# headers must be the same with dat.keys()
for dat in dict_data:
line = []
for field in headers:
line.append(dat[field])
writer.writerow(line)
csv_file.close()
if __name__ == "__main__":
big_csv = csv_to_dict('/path/to/big_csv_file.csv')
small_csv = csv_to_dict('/path/to/small_csv_file.csv')
output = []
for s in small_csv:
for b in big_csv:
if s['id'] == b['id']:
output.append(b)
if output:
dict_to_csv('/path/to/output.csv', output)
else:
print "Nothing."
Hope that will help.
You need to read the data into a data structure, assuming OTUid is unique you can store this into a dictionary for fast lookup:
with open(sys.argv[1],"rU") as SparCCnames:
d = csv.DictReader(SparCCnames)
fieldnames = d.fieldnames
data = {i['OTUid']: i for i in d}
with open(sys.argv[2],"rU") as OTU_table, open(sys.argv[3],"w") as new_file:
Sparcc_OTUs = csv.DictWriter(new_file, fieldnames)
ids = csv.DictReader(OTU_table)
for record in ids:
if record['OTUid'] in data:
Sparcc_OTUs.writerow(data[record['OTUid']])
Thank you everyone for your help. I played with things and consulted with an advisor, and finally got a working script. I am posting it in case it helps someone else in the future.
Thanks!
import sys
import csv
input_file = csv.DictReader(open(sys.argv[1], "rU")) #has all info
ref_list = csv.DictReader(open(sys.argv[2], "rU")) #reference list
output_file = csv.DictWriter(
open(sys.argv[3], "w"), input_file.fieldnames) #to write output file with headers
output_file.writeheader() #write headers in output file
white_list={} #create empty dictionary
for record in ref_list: #for every line in my reference list
white_list[record["Sample_ID"]] = None #store into the dictionary the ID's as keys
for record in input_file: #for every line in my input file
record_id = record["Sample_ID"] #store ID's into variable record_id
if (record_id in white_list): #if the ID is in the reference list
output_file.writerow(record) #write the entire row into a new file
else: #if it is not in my reference list
continue #ignore it and continue iterating through the file

Categories

Resources