#gets rid of spaces in existing csv headers
def getColumns(readCSV):
return [column.replace(' ','') for column in next(readCSV)]
#insert format used to avoid hard-coded headers in script
def insertData(tableName, columns, readCSV):
print("Inserting Data")
query = 'INSERT INTO {}({}) VALUES ({})'.format(
tableName,
','.join(columns),
','.join('?' * len(columns))
)
for data in readCSV:
cursor.execute(query, data)
con.commit()
def updateTable(csvPath, tableName):
print("Updating table...")
print("Reading file contents and uploading into db table...")
## insert timestamp column into existing csv. Does not incorporate header correctly for timestamp
rows = []
with open(csvPath, 'r', newline='') as csvFile:
readCSV = csv.reader(csvFile, delimiter=',')
for row in readCSV:
rows.append(row)
with open(csvPath, 'w', newline='')as writeFile:
file_write = csv.writer(writeFile)
for val in rows:
timestamp = datetime.now()
val.insert(0, timestamp)
file_write.writerow(val)
with open(csvPath) as csvFile:
readCSV = csv.reader(csvFile, delimiter=',')
columns = getColumns(readCSV)
insertData(tableName, columns, readCSV)
print ("Upload complete")
Above is a snippet of the code I'm working on. I am gathering data from a csv to insert into a SQL database. Currently, the csv does not have a timestamp column and without that, the import wont work as it sees duplicate data. I found a solution at https://www.geeksforgeeks.org/how-to-add-timestamp-to-csv-file-in-python/ for adding a timestamp column and have incorporated it into the code, but it does not add a header for the column. I'm sure it's an easy fix, but I am new to python and cant find the solution anywhere else. Also, if you see something inefficient with the coding in updateTable, let me know so that I can recognize it and learn a better way.
I ended up using a different method with import pandas as pd
def updateTable(csvPath, tableName):
print("Updating table...")
print("Reading file contents and uploading into db table...")
timestamp = datetime.now()
df = pd.read_csv(csvPath)
df.insert(0, 'timestamp',timestamp)
df.to_csv(csvPath, index=False)
with open(csvPath) as csvFile:
readCSV = csv.reader(csvFile, delimiter=',')
columns = getColumns(readCSV)
insertData(tableName, columns, readCSV)
print ("Upload complete")
Related
I am trying to make a function that takes a threshold and determines which names from a csv file of song names and their lyrics that contain human names and the function
should create a csv file named outputfile that contains the number of distinct names, the name of
the song and the artist.
import csv
def findName(thresh, outputFile):
dictNames={}
with open('allNames.csv') as csvfile:
reader = csv.DictReader(csvfile, delimiter="\t")
for row in reader:
if row["name"] in dictNames:
dictNames[row["name"]] +=1
else:
dictNames[row["name"]]=1
with open(outputFile, "w", newline='') as outfile:
headers= ["song", "artist", "year"]
writer=csv.DictWriter(outfile, fieldnames=headers)
writer.writeheader()
for key, val in dictNames.items():
if val>= thresh:
writer.writerow({key: val})
csvfile.close()
outfile.close()
What's the rationale for not using Pandas here?
Not sure I fully understand your question, but I'm thinking something like:
df = pd.read_csv('allNames.csv')
#partition df after threshold
df['index'] = df.index
def partition_return(threshold, df):
df = df.loc[df['index'] >= threshold].reset_index(drop=true)
df = df[['song', 'artist', 'year]]
df['count_names_dist'] = len(df['artist'].unique())
df.to_csv('outfile.csv', index=False)
I am working on a Python script to do some steps.
Right now everything is working except the part that it has to go through all table rows and save each line as a JSON object. The problem is: It only saves the last line. So it's not saving the previous lines. I know where is the problem but don't know how to fix.
Here is the code:
url = 'http://website.com/group1.html'
htmlFile= urlopen(urlGroup1)
soup= BeautifulSoup(htmlGroup1, 'html.parser')
table = soup2.find_all("table", {"class": "sortable employeeList result-table"})[0]
rows = table.find_all('tr')
Filegroup1 = open('group1.csv', 'wt+')
Datagroup1 = csv.writer(Filegroup1)
jsonFilePath = r'group1.json'
def make_json(Filegroup1, jsonFilePath):
data2 = {}
with open('group1.csv', encoding='utf-8') as csvf:
csvReader = csv.DictReader(csvf)
for rows in csvReader:
data2 = rows
with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
jsonf.write(simplejson.dumps([data2], indent=4))
try:
for row in rows:
FilteredRow = []
for cell in row.find_all(['td', 'th']):
FilteredRow.append(cell.get_text().strip())
Datagroup1.writerow(FilteredRow)
finally:
Filegroup1.close()
make_json(Filegroup1, jsonFilePath)
The issue is here:
for rows in csvReader:
data2 = rows
If I change it to the following, it will work!! But it will group each object by Employee ID. Which I don't want that.
for rows in csvReader:
key = rows['Employee Name']
data3[key] = rows
I have some code that scrapes several tables from webpages and then puts the data into several excel files.
I want to also be able to add the company name at the bottom of the excel file. I have worked out how to get the name of the company using companyname = soup.find('h1').text as shown in the first code block below.
One of the excel files is generated from the following code lines:
all_data = {}
#for every table found on the page
for table in soup.select('table.BordCollapseYear2'):
table_name = table.find_previous('b').text
all_data[table_name] = []
#scrape for every row
for tr in table.select('tr'):
row = [td.get_text(strip=True, separator=' ') for td in tr.select('td')]
if len(row) == 7:
all_data[table_name].append(row)
companyname = soup.find('h1').text
with open('data2.csv', 'a', newline='') as csvfile:
spamwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
for row in all_data:
spamwriter.writerow(row)
I tried adding the line writerows(companyname) this worked but it separated out each letter. So I think I am nearly there...
Put [] around companyname in writerow().
For example:
with open('data2.csv', 'a', newline='') as csvfile:
spamwriter = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
for row in all_data:
spamwriter.writerow(row)
spamwriter.writerow([companyname]) # <-- notice the `[]` around companyname
I have a Problem with continues writing my datas in a csv-file. I want a program that detects, if there is a csv-file for my measurements-data. If not it would be generated. When the csv-file is new generated the datas are written in the csv-file on the column after the header with the variable cycle = 0.
If the csv-file exists, the datas should be written continuously after the last line of the csv. Also the variable cycle should continue.
I have written a program that can detect if there is a file or not but with the continuously lines I have problems.
I hope someone can help me.
# mes = Array with 20 spaces filled with the Numbers 0-19
date = time.strftime("%d/%m/%Y")
def write(cycle, mes):
if os.path.exists('/home/pi/Documents/Ventilatorprüfstand_Programm/out.csv') is True: #does the out.csv existate?
print("Do something")
out = open('out.csv', 'w')
data = [[cycle, mes[0],mes[1],mes[2],mes[3],mes[4],mes[5],mes[6],mes[7],mes[8],mes[9],mes[10],mes[11],mes[12],mes[13],mes[14],mes[15],mes[16],mes[17],mes[18],mes[19], date]]
line = cycle+1
for row in data:
for line in row:
out.write('%s;' % line)
out.write('\n')
out.close()
else:
print("Do another something")
header = lookuptable.names()
out = open('out.csv', 'w')
for row in header:
for column in row:
out.write('%s' % column)
out.write('\t')
out.write('\n')
data = [[cycle, mes[0],mes[1],mes[2],mes[3],mes[4],mes[5],mes[6],mes[7],mes[8],mes[9],mes[10],mes[11],mes[12],mes[13],mes[14],mes[15],mes[16],mes[17],mes[18],mes[19], date]]
for row in data:
for column in row:
out.write('%s;' % column)
out.write('\n')
out.close()`
When opening the file with open() there is the option 'a' to append the new lines to the end:
'a' open for writing, appending to the end of the file if it exists
Here is an example using the csv Python standard library:
import csv
import os
import random
headers = ['cycle', 'date', 'speed', 'temp', 'power']
new_data = [[random.randint(0, 100) for _ in range(3)] for _ in range(2)]
date = '00/01/02'
cycle = 1
# Copy the data and include the date and the cycle number:
full_rows = [ [cycle, date, *row] for row in new_data ]
filename = 'example.csv'
# Check if the file exist, if not create the file with header
if not os.path.exists(filename):
print('creating a new file')
with open(filename, 'w') as csvfile:
csvwriter = csv.writer(csvfile, delimiter=',')
csvwriter.writerow(headers) # add the header
# Append the data to the file
with open(filename, 'a', newline='') as csvfile: # note the 'a' option
csvwriter = csv.writer(csvfile, delimiter=',')
csvwriter.writerows(full_rows)
I have a csv file, l__cyc.csv, that contains this:
trip_id, time, O_lat, O_lng, D_lat, D_lng
130041910101,1300,51.5841153671,0.134444590094,51.5718053872,0.134878021928
130041910102,1335,51.5718053872,0.134878021928,51.5786920389,0.180940040247
130041910103,1600,51.5786920389,0.180940040247,51.5841153671,0.134444590094
130043110201,1500,51.5712712038,0.138532882664,51.5334949484,0.130489470325
130043110202,1730,51.5334949484,0.130489470325,51.5712712038,0.138532882664
And I am trying to pull out separate values, using:
with open('./l__cyc.csv', 'rU') as csvfile:
reader = csv.DictReader(csvfile)
origincoords = ['{O_lat},{O_lng}'.format(**row) for row in reader]
with open('./l__cyc.csv', 'rU') as csvfile:
reader = csv.DictReader(csvfile)
trip_id = ['{trip_id}'.format(**row) for row in reader]
with open('./l__cyc.csv', 'rU') as csvfile:
reader = csv.DictReader(csvfile)
destinationcoords = ['{D_lat},{D_lng}'.format(**row) for row in reader]
Where origincoords should be 51.5841153671, 0.134444590094,
trip_id should be 130041910101, and destinationcoords should be
51.5718053872, 0.134878021928.
However, I get a KeyError:
KeyError: 'O_lat'
Is this something simple and there's something fundamental I'm misunderstanding?
You just avoid the space between headers
trip_id,time,O_lat,O_lng,D_lat,D_lng
OR
reader = csv.DictReader(csvfile, skipinitialspace=True)
First things first, you get the key error, because the key does not exist in your dictionary.
Next, I would advise against running through the file 3 times, when you can do it a single time!
For me it worked, when I added the fieldnames to the reader.
import csv
from cStringIO import StringIO
src = """trip_id, time, O_lat, O_lng, D_lat, D_lng
130041910101,1300,51.5841153671,0.134444590094,51.5718053872,0.134878021928
130041910102,1335,51.5718053872,0.134878021928,51.5786920389,0.180940040247
130041910103,1600,51.5786920389,0.180940040247,51.5841153671,0.134444590094
130043110201,1500,51.5712712038,0.138532882664,51.5334949484,0.130489470325
130043110202,1730,51.5334949484,0.130489470325,51.5712712038,0.138532882664
"""
f = StringIO(src)
# determine the fieldnames
fieldnames= "trip_id,time,O_lat,O_lng,D_lat,D_lng".split(",")
# read the file
reader = csv.DictReader(f, fieldnames=fieldnames)
# storage
origincoords = []
trip_id = []
destinationcoords = []
# iterate the rows
for row in reader:
origincoords.append('{O_lat},{O_lng}'.format(**row))
trip_id.append('{trip_id}'.format(**row))
destinationcoords.append('{D_lat},{D_lng}'.format(**row))
# pop the header off the list
origincoords.pop(0)
trip_id.pop(0)
destinationcoords.pop(0)
# show the result
print origincoords
print trip_id
print destinationcoords
I don't really know what you are trying to achieve there, but I'm sure there is a better way of doing it!