Looping through two sets of data to conditionally append to CSV file - python

I'm writing a script that should: 1) open a CSV file 2) loop through some eBay results data 3) write details from this data to the same file if it matches a search term and if it's not already present, but it has a few issues:
The headers are not written if not present, so I'm manually adding them. I was previously using DictWriter() with writeheader(), but fieldnames was required, so they were written each time
The if result_id != existing_result_id: condition doesn't work in a nested loop, so the rows aren't written. if any(x in result_title.upper() for x in search_terms): does currently work, but I'm not sure how to combine them before writing the rows
import csv
import smtplib
from requests_html import HTMLSession

# Terms a listing title must contain (case-insensitively) to be recorded.
SEARCH_TERMS = ['one', 'two']

session = HTMLSession()
response = session.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=blink+182&_sacat=0&_sop=10&rt=nc&LH_PrefLoc=2')
results = response.html.find('ul.srp-results li.s-item')

# NOTE(review): the same file is opened twice — once for reading, once for
# appending — which is one of the issues discussed in the answer below.
with open('ebay-results.csv', 'r') as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader) # Skip headers
    with open('ebay-results.csv', 'a') as csv_file_updated:
        # field_names = ['Title', 'Link', 'ID']
        csv_writer = csv.writer(csv_file_updated)
        search_terms = [x.upper() for x in SEARCH_TERMS]
        for result in results:
            result_title = result.find('h3.s-item__title', first=True).text.replace('New Listing', '')
            result_link = result.find('a.s-item__link', first=True).attrs['href'].split('?')[0]
            # assumes the listing id is the 5th '/'-separated URL segment — TODO confirm
            result_id = result_link.split('/')[4].split('?')[0]
            result_info = [result_title, result_link, result_id]
            if any(x in result_title.upper() for x in search_terms):
                # NOTE(review): csv_reader is exhausted after the first result,
                # so this inner loop body runs for at most one result per run.
                for line in csv_reader:
                    existing_result_id = line[2]
                    if result_id != existing_result_id:
                        csv_writer.writerow(result_info)
                        send_email(search_terms, result_link)

There's a few issues with your code:
you create a file handle to read a file and then another file handle to append to the same file; that's dodgy at best, do you expect the reader to read lines you've appended? What's the purpose?
you exhaust the reader with for line in csv_reader: for every result, which will only work once, since you don't seek the start of the file before rereading; however, why reread the file over and over anyway?
standard Python indentation is 4 deep, you would do well to follow convention, as not doing so just makes your code harder to read, maintain and reuse.
It appears you simply want to write results for identifiers you haven't written results for previously to the file. Why not do this:
read the results once, keeping the identifiers for a quick lookup
then loop over the results, writing results with new identifiers
This is likely what you were after:
import csv
from requests_html import HTMLSession

# Terms a listing title must contain (case-insensitively) to be recorded.
SEARCH_TERMS = ['one', 'two']

session = HTMLSession()
response = session.get('https://www.ebay.com/sch/i.html?_from=R40&_nkw=blink+182&_sacat=0&_sop=10&rt=nc&LH_PrefLoc=2')
results = response.html.find('ul.srp-results li.s-item')

with open('ebay-results.csv', 'r+', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    next(csv_reader)  # Skip headers
    # A set gives O(1) membership tests (a list is O(n) per lookup); it also
    # lets us record ids written during this run, so the same listing is
    # never appended twice even if it appears in `results` more than once.
    existing_ids = {rec[2] for rec in csv_reader}
    # field_names = ['Title', 'Link', 'ID']
    # After the reader is exhausted the file position is at EOF, so the
    # writer below appends after the existing rows.
    csv_writer = csv.writer(csv_file)
    search_terms = [x.upper() for x in SEARCH_TERMS]
    for result in results:
        result_title = result.find('h3.s-item__title', first=True).text.replace('New Listing', '')
        result_link = result.find('a.s-item__link', first=True).attrs['href'].split('?')[0]
        result_id = result_link.split('/')[4].split('?')[0]
        result_info = [result_title, result_link, result_id]
        if any(x in result_title.upper() for x in search_terms):
            if result_id not in existing_ids:
                csv_writer.writerow(result_info)
                existing_ids.add(result_id)
                # not implemented, but not relevant to the question
                # send_email(search_terms, result_link)
You asked how to deal with the headers, another issue might be that the .csv doesn't exist when first running. Something like this would solve both:
from pathlib import Path

# Seed the CSV with a header row the first time the script runs, so the
# read below always finds at least a header line.
csv_path = Path('ebay-results.csv')
if not csv_path.is_file():
    csv_path.write_text('Title,Link,ID\n')

# Reopen the file for reading; it now must exist and has a header.
with open('ebay-results.csv', 'r+', newline='') as csv_file:
    csv_reader = csv.reader(csv_file)
    field_names = next(csv_reader)

Related

csv header in python only on the top row?

i have written a python program which makes an api call to a webserver once every minute and then parse the json response and saves parsed values in to the csv files.
here is the code that is saving the values into the csv file :
# Append one reading per call; note that no header is ever written here,
# which is exactly the problem the question asks about.
with open('data.csv', 'a', newline='') as file:
    writer = csv.writer(file)
    writer.writerow([current_time,SHORTPERC, LONGPERC, SHORTvolume, longVolume, longPositions, shortPositions])
How can I make it save the header only once, on the topmost row, rather than on every row?
UPDATE:
here is a bit of more code to make api call and write the data to file :
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.cron import CronTrigger
import requests
import json
import csv
from datetime import datetime
def fn():
    """Poll the myfxbook community-outlook API once and append the EURUSD
    readings to three CSV files (full row, volumes only, percentages only)."""
    print("Starting...")
    # NOTE(review): hard-coded session ids expire; consider loading this
    # from configuration or the environment instead.
    session_id = "auZsJ4F2RsQNJxSPTMDt2238324"
    Outlook='http://www.myfxbook.com/api/get-community-outlook.json?session=' + session_id
    Outlook_response = requests.get(Outlook)
    Outlook_data = Outlook_response.json()['symbols']
    now = datetime.now()
    current_time = now.strftime("%H:%M")  # e.g. "07:11"
    # assumes EURUSD is always the first symbol in the response — TODO confirm
    EURUSD=Outlook_data[0]
    SHORTPERC=EURUSD['shortPercentage']
    LONGPERC =EURUSD['longPercentage']
    SHORTvolume=EURUSD['shortVolume']
    longVolume=EURUSD['longVolume']
    longPositions=EURUSD['longPositions']
    shortPositions=EURUSD['shortPositions']
    # Each block appends one data row; no header is ever written, which is
    # what the question asks how to fix.
    with open('data.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([current_time,SHORTPERC, LONGPERC, SHORTvolume, longVolume, longPositions, shortPositions])
    with open('data1.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([SHORTvolume, longVolume])
    with open('data2.csv', 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([SHORTPERC, LONGPERC])
I can't post the full code because it's around 700 lines long, but the code above should be enough to show how the CSV file is created.
this is how one of my csv files look :
07:11,31,69,555.55,1265.14,4750,2607
07:12,31,69,555.55,1265.16,4751,2607
07:13,31,69,555.55,1265.16,4751,2607
07:14,30,70,555.56,1267.36,4752,2608
07:15,30,70,555.56,1267.36,4752,2608
07:16,30,70,555.56,1267.36,4752,2608
07:17,30,70,555.46,1267.36,4752,2607
07:18,31,69,558.61,1267.36,4752,2610
07:19,31,69,558.61,1267.37,4753,2610
07:20,31,69,561.58,1267.37,4753,2611
07:21,31,69,561.61,1267.37,4753,2613
07:22,31,69,561.65,1267.37,4753,2614
07:23,31,69,561.65,1267.36,4752,2614
this is just part of the csv file , more rows keep adding as time passes
EDIT 2:
answer suggested by Sparkofska seems to work but somehow it ends up giving an empty row in between every line like this:
Time,ShortPer,LongPer,ShortVolume,LongVolume,ShortPosition,LongPosition
05:47,44,56,19528.8,24789.27,65223,48630
05:48,44,56,19529.04,24789.27,65223,48633
code :
# Pull the EURUSD fields out of the already-fetched outlook data.
EURUSD=Outlook_data[0]
SHORTPERC=EURUSD['shortPercentage']
LONGPERC =EURUSD['longPercentage']
SHORTvolume=EURUSD['shortVolume']
longVolume=EURUSD['longVolume']
longPositions=EURUSD['longPositions']
shortPositions=EURUSD['shortPositions']
filename='EURUSD.csv';
def write_row_header_aware(filename, row):
    # Write the header only when the file is missing or empty.
    # NOTE(review): opening without newline='' is the likely cause of the
    # blank line between rows described above — the csv docs require
    # newline='' on files passed to csv.writer.
    if not os.path.exists(filename) or os.stat(filename).st_size == 0:
        with open(filename, 'a') as file:
            writer = csv.writer(file)
            writer.writerow(['Time', 'ShortPer', 'LongPer','ShortVolume','LongVolume','ShortPosition','LongPosition'])
    with open(filename, 'a') as file:
        writer = csv.writer(file)
        # NOTE(review): the `row` parameter is never used — the globals
        # above are written instead.
        writer.writerow([current_time,SHORTPERC, LONGPERC, SHORTvolume, longVolume, longPositions, shortPositions])
write_row_header_aware(filename, [current_time,SHORTPERC, LONGPERC, SHORTvolume, longVolume, longPositions, shortPositions])
print("done...")
You could wrap the writerow function to have it automatically add the header if needed.
If your output csv file is not empty, we can assert the header was already written and simply append the row. Otherwise (file not exist or empty) we write the header before appending the row.
import os

def write_row_header_aware(filename, row):
    """Append *row* to *filename*, writing the header first if needed.

    The header is emitted only when the file does not exist or is empty,
    so repeated calls never duplicate it. A single open in append mode
    covers both writes, and newline='' stops the csv module from emitting
    blank lines between rows on Windows.
    """
    # Decide before opening: opening in 'a' mode creates the file, which
    # would make the size check always see an existing file.
    needs_header = not os.path.exists(filename) or os.stat(filename).st_size == 0
    with open(filename, 'a', newline='') as file:
        writer = csv.writer(file)
        if needs_header:
            # placeholder column names — replace ... with the rest of yours
            writer.writerow(['current_time', 'SHORTPERC', 'LONGPERC', ...])
        # write line as usual
        writer.writerow(row)
write_row_header_aware('data.csv', [current_time, SHORTPERC, LONGPERC, ...])
Please add a check for whether the file already exists: if it does, open it in append mode and write only the rows; otherwise, write the headers first. That way you avoid writing the headers multiple times. Please refer to this link.

How to update a CSV file using its package

Here's my code:
def update_win():
    """Search the CSV chosen in the GUI for `searchID` and overwrite that
    row's Name/Course/Year with the values currently in the input bar."""
    #GET EVERY STRING VALUES NEEDED FROM INPUTBAR
    stud_ID = str(ID_num.get())
    stud_name = str(name.get())
    stud_course = str(Crs.get())
    stud_year = str(Yr.get())
    searchID = str(sID_num.get())#ID NUMBER FOR SEARCH
    filename = str(files.get())#FILENAME
    tempfile = NamedTemporaryFile(mode='w', delete=False)
    fields = ['ID', 'Name', 'Course', 'Year']
    # Copy every row to the temp file, rewriting the matching one.
    with open(filename, 'r') as csvfile, tempfile:
        # NOTE(review): passing fieldnames to DictReader makes it treat the
        # header row as data — confirm the input file really has no header.
        reader = csv.DictReader(csvfile, fieldnames=fields)
        writer = csv.DictWriter(tempfile, fieldnames=fields)
        for row in reader:
            if row['ID'] == searchID:
                row['Name'], row['Course'], row['Year'] = stud_name, stud_course, stud_year
                msg = Label(upd_win, text="Update Successful", font="fixedsys 12 bold").place(x=3,y=200)
            row = {'ID': row['ID'], 'Name': row['Name'], 'Course': row['Course'], 'Year': row['Year']}
            writer.writerow(row)
    # Replace the original file with the rewritten temp file.
    shutil.move(tempfile.name, filename)
So this code is an UPDATE, it searches that ID number from the CSV, and shows its rows via a GUI as you can see its not print but Label, after that it prompts the user to enter the new ID number, new name, new course, and new year, you want to replace to the row you have selected.
It does get through, but the value doesn't change. Any ideas what happened here and how I fix it?
Sometimes you need to isolate your problem by writing separate scripts to test your processes. This exercise is the first step to creating a Minimal, Complete, and Verifiable example for posting here. Oftentimes, creating an mcve helps you find the problem even before you post the question.
Here is a script to test whether the read-modify-write process works:
from tempfile import NamedTemporaryFile
import io

# Fixture: a header-like first line plus four data rows.
s = '''one, two, three
1,2,3
4,5,6
7,8,9
10,11,12
'''
data_csv = io.StringIO(s)
g = NamedTemporaryFile(mode = 'w', delete = False)
fields = ['a', 'b', 'c']

# read-modify-write: rewrite column 'b' of every row (the first line too,
# since DictReader is given explicit fieldnames and treats it as data)
with data_csv as f, g:
    writer = csv.DictWriter(g, fields)
    writer.writeheader()
    for record in csv.DictReader(f, fields):
        record['b'] = 'bar'
        writer.writerow(record)

# test - did the modifications get written to a temp file?
with open(g.name) as f:
    print(f.read())
Which does seem to be working, the tempfile has modified data in it.
Maybe HOW you modified the data is the problem - but changing the test script to match the form of your code also works fine
...
for line in r:
line['a'], line['b'], line['c'] = line['a'], 'foo', line['c']
line = {'a':line['a'], 'b':line['b'], 'c':line['c']}
w.writerow(line)
Assuming all the .get()'s in the first lines of the function are working, filename in the line
shutil.move(tempfile.name, filename)
must not have the correct path.
OR the conditional
if row['ID'] == searchID:
isn't working.
Food for thought:
Moving code into functions, like the read-modify-write portion, can not only help with readability, it can make testing easier.
update_win() works by using a side effect (shutil.move(tempfile.name, filename)) instead of returning something that can be acted on. Side effects can make testing harder.
That isn't necessarily bad (sometimes it is practical), you just need to be aware that you are doing it.
You have opened both files for 'r' read and you are trying to write to temp file.
You can use 'r+' mode with one file handle in order to read and write to a file
...
with open(filename, 'r+') as csvfile:
reader = csv.DictReader(csvfile, fieldnames=fields)
writer = csv.DictWriter(csvfile, fieldnames=fields)
...

Trying to read the first line of CSV file returns ['/']

I have uploaded a CSV file through Django and I trying to read the first line of it. The file is stored on the server in
/tmp/csv_file/test.csv
The file looks like this:
column_1,column_2,column_3
2175,294,Nuristan
2179,299,Sar-e-Pul
I am trying to get the headings of the file like:
absolute_base_file = '/tmp/csv_file/test.csv'
# NOTE(review): csv.reader() is given a path *string* here, so it iterates
# the string character by character — the answers below pass an open file
# object instead.
csv_reader = csv.reader(absolute_base_file)
csv_headings = next(csv_reader)
print csv_headings
I only get this in return:
['/']
EDITED
The permissions of the CSV file are:
-rw-rw-r--
Which should be ok.
EDITED AGAIN
Based on the recommendations and the help of #EdChum and #Moses Koledoye
I have checked if the file is read correctly using:
print (os.stat(absolute_base_file).st_size) # returns 64
Then I tried to see if seek(0) and csvfile.read(1) return a single printable character.
print csvfile.seek(0) returns None
print csvfile.read(1) returns 'c'
Then I thought perhaps there is a particular issue with next() function and I tried an alternative:
csv_reader = csv.reader(csvfile)
for row in csv_reader:
print ("csv_reader")
Again this didn't work.
You passed a string instead of a file object which is why you get the slash, change to this:
with open (absolute_base_file) as csvfile:
csv_reader = csv.reader(csvfile)
check the docs
See this working:
In [5]:
import csv
with open (r'c:\data\csv_test.csv') as csvfile:
csv_reader = csv.reader(csvfile)
csv_headings = next(csv_reader)
print (csv_headings)
['column_1', 'column_2', 'column_3']
To successively access each row call next:
In [7]:
import csv
with open (r'c:\data\csv_test.csv') as csvfile:
csv_reader = csv.reader(csvfile)
csv_headings = next(csv_reader)
print (csv_headings)
first_row = next(csv_reader)
print( 'first row: ', first_row)
['column_1', 'column_2', 'column_3']
first row: ['2175', '294', 'Nuristan']
You should pass a file object to your csv.reader not a string literal.
# Pass an open file object (not a path string) to csv.reader.
absolute_base_file = open(r'/tmp/csv_file/test.csv') # notice open
csv_reader = csv.reader(absolute_base_file)
csv_headings = next(csv_reader)
print csv_headings

Editing a downloaded CSV in memory before writing

Forewarning: I am very new to Python and programming in general. I am trying to use Python 3 to get some CSV data and making some changes to it before writing it to a file. My problem lies in accessing the CSV data from a variable, like so:
import csv
import requests

# `session` is a requests session created elsewhere; `do(something)` is a
# placeholder for the real per-row processing.
csvfile = session.get(url)
# NOTE(review): .content is bytes; iterating bytes gives ints, which is what
# produces the "iterator should return strings" error quoted below.
reader = csv.reader(csvfile.content)
for row in reader:
    do(something)
This returns:
_csv.Error: iterator should return strings, not int (did you open the file in text mode?)
Googling revealed that I should be feeding the reader text instead of bytes, so I also attempted:
reader = csv.reader(csvfile.text)
This also does not work as the loop works through it letter by letter instead of line by line. I also experimented with TextIOWrapper and similar options with no success. The only way I have managed to get this to work is by writing the data to a file, reading it, and then making changes, like so:
csvfile = session.get(url)
# Round-trip through a temp file: write the raw bytes, then reopen as text
# so csv.reader receives strings. The question asks how to avoid this file.
with open("temp.txt", 'wb') as f:
    f.write(csvfile.content)
with open("temp.txt", 'rU', encoding="utf8") as data:
    reader = csv.reader(data)
    for row in reader:
        do(something)
I feel like this is far from the most optimal way of doing this, even if it works. What is the proper way to read and edit the CSV data directly from memory, without having to save it to a temporary file?
you don't have to write to a temp file, here is what I would do, using the "csv" and "requests" modules:
import csv
import requests

# NOTE(review): dunder-style names are conventionally reserved for Python
# internals; plain UPPER_CASE constants would be more idiomatic here.
__csvfilepathname__ = r'c:\test\test.csv'
__url__ = 'https://server.domain.com/test.csv'
def csv_reader(filename, enc = 'utf_8'):
    """Open the CSV file at *filename* and print every parsed row."""
    with open(filename, 'r', encoding = enc) as handle:
        for record in csv.reader(handle):
            #do something
            print(record)
    return
def csv_from_url(url):
    """Fetch a CSV document over HTTP and print each parsed row.

    The original rebuilt each line one character at a time; stripping the
    carriage returns and splitting on newlines produces exactly the same
    list of lines (including a trailing empty entry when the body ends with
    a newline) in two calls.
    """
    s = requests.Session()
    r = s.get(url)
    datalist = r.text.replace('\r', '').split('\n')
    # at this point you already have a data list 'datalist'
    # no need really to use the csv.reader object, but here goes:
    reader = csv.reader(datalist)
    for row in reader:
        #do something
        print(row)
    return
def main():
    # Demonstrate both readers: one against a local file, one against a URL.
    csv_reader(__csvfilepathname__)
    csv_from_url(__url__)
    return

if __name__ == '__main__':
    main()
not very pretty, and probably not very good in regards to memory/performance, depending on how "big" your csv/data is
HTH, Edwin.

extracting data from CSV file using a reference

I have a csv file with several hundred organism IDs and a second csv file with several thousand organism IDs and additional characteristics (taxonomic information, abundances per sample, etc)
I am trying to write a code that will extract the information from the larger csv using the smaller csv file as a reference. Meaning it will look at both smaller and larger files, and if the IDs are in both files, it will extract all the information form the larger file and write that in a new file (basically write the entire row for that ID).
So far I have written the following. While the code does not error out, I get a blank file in the end and I don't exactly know why. I am a graduate student who knows some simple coding, but I'm still very much a novice,
thank you
import sys
import csv
import os.path

# Open the reference list (argv[1]), the full OTU table (argv[2]) and the
# output file (argv[3]).
SparCCnames=open(sys.argv[1],"rU")
OTU_table=open(sys.argv[2],"rU")
new_file=open(sys.argv[3],"w")
Sparcc_OTUs=csv.writer(new_file)
d=csv.DictReader(SparCCnames)
ids=csv.DictReader(OTU_table)
for record in ids:
    idstopull=record["OTUid"]
    if idstopull[0]=="OTUid":
        continue
    # NOTE(review): `idstopull[0]` is only the first *character* of the ID,
    # and `d` is a DictReader (an iterator of row dicts), so this membership
    # test never matches whole IDs — the likely reason the output is empty.
    if idstopull[0] in d:
        # NOTE(review): `new_id` is undefined (Sparcc_OTUs intended?) and
        # writerow needs parentheses, not square brackets.
        new_id.writerow[idstopull[0]]
SparCCnames.close()
OTU_table.close()
new_file.close()
I'm not sure what you're trying to do in your code but you can try this:
def csv_to_dict(csv_file_path):
    """Read a delimited text file into a list of row dicts.

    The delimiter is sniffed from the first 10 kB of the file (tab, comma
    or semicolon are accepted). The file is opened in text mode with
    newline='', as the csv module requires on Python 3 — the original 'rb'
    mode only worked on Python 2 — and a context manager guarantees the
    handle is closed even if sniffing or parsing raises.
    """
    with open(csv_file_path, 'r', newline='') as csv_file:
        sniffdialect = csv.Sniffer().sniff(csv_file.read(10000), delimiters='\t,;')
        csv_file.seek(0)
        dict_reader = csv.DictReader(csv_file, dialect=sniffdialect)
        dict_data = list(dict_reader)
    return dict_data
def dict_to_csv(csv_file_path, dict_data):
    """Write a list of row dicts to *csv_file_path* with a header row.

    The header comes from the first record's keys, so every dict must share
    those keys. Uses the 'excel' dialect (comma-separated, CRLF line
    endings). Opened in text mode with newline='' as Python 3's csv module
    requires — the original 'wb' mode only worked on Python 2 — and a
    context manager replaces the manual close().
    """
    headers = dict_data[0].keys()
    with open(csv_file_path, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, dialect='excel')
        writer.writerow(headers)
        # headers must be the same with dat.keys()
        for dat in dict_data:
            writer.writerow([dat[field] for field in headers])
if __name__ == "__main__":
    # Keep every record of the big file whose 'id' also appears in the
    # small file (assumes both files have an 'id' column — TODO confirm).
    big_csv = csv_to_dict('/path/to/big_csv_file.csv')
    small_csv = csv_to_dict('/path/to/small_csv_file.csv')
    output = []
    for s in small_csv:
        for b in big_csv:
            if s['id'] == b['id']:
                output.append(b)
    if output:
        dict_to_csv('/path/to/output.csv', output)
    else:
        # print() call: the original Python 2 print statement is a syntax
        # error on Python 3.
        print("Nothing.")
Hope that will help.
You need to read the data into a data structure, assuming OTUid is unique you can store this into a dictionary for fast lookup:
# First pass: load the reference table (argv[1]) into an id -> row mapping
# for O(1) lookups.
with open(sys.argv[1],"rU") as SparCCnames:
    d = csv.DictReader(SparCCnames)
    fieldnames = d.fieldnames
    data = {i['OTUid']: i for i in d}
# Second pass: copy matching rows from the OTU table (argv[2]) to the
# output file (argv[3]).
with open(sys.argv[2],"rU") as OTU_table, open(sys.argv[3],"w") as new_file:
    Sparcc_OTUs = csv.DictWriter(new_file, fieldnames)
    ids = csv.DictReader(OTU_table)
    for record in ids:
        if record['OTUid'] in data:
            Sparcc_OTUs.writerow(data[record['OTUid']])
Thank you everyone for your help. I played with things and consulted with an advisor, and finally got a working script. I am posting it in case it helps someone else in the future.
Thanks!
import sys
import csv

# Copy rows from the full table (argv[1]) into the output file (argv[3]),
# keeping only rows whose Sample_ID also appears in the reference list
# (argv[2]). Context managers close all three handles (the original
# open() calls were never closed), and newline='' prevents blank lines
# between rows on Windows.
with open(sys.argv[1], newline='') as full_table, \
        open(sys.argv[2], newline='') as reference, \
        open(sys.argv[3], 'w', newline='') as out_handle:
    input_file = csv.DictReader(full_table)   # has all info
    ref_list = csv.DictReader(reference)      # reference list
    output_file = csv.DictWriter(out_handle, input_file.fieldnames)
    output_file.writeheader()  # write headers in output file
    # set of IDs to keep (the original used a dict with None values)
    white_list = {record["Sample_ID"] for record in ref_list}
    for record in input_file:  # for every line in my input file
        if record["Sample_ID"] in white_list:  # if the ID is in the reference list
            output_file.writerow(record)       # write the entire row

Categories

Resources