Django import CSV with carriage return - python

I'm creating a Django app and I need to import several *.csv files.
One of these files has this structure:
id|value (header)
12|¤this is the
value¤
34|¤this is another
value¤
I use this code to parse the file:
try:
csvfile = open(path, "r", encoding='utf-16')
except IOError:
return False
cursor.copy_from(csvfile , tblname, columns=['id', 'value'], sep='|')
But when I try to parse this file, it gives me this error:
psycopg2.DataError: ERROR: missing data for the column "value"
Is there a way to parse this file keeping carriage return inside text identifier ('¤')?

You could use Python's csv module for reading that.
import csv
try:
csvfile = open(path, newline='')
except IOError:
return False
csvreader = csv.reader(csvfile, delimiter='|', quotechar='¤')
for row in csvreader:
print(', '.join(row)) # or do something else with the row of data.

One approach would be to build up the entries yourself as follows:
# Reassemble multi-line records: a physical line containing the delimiter
# starts a new logical record; any other line is a continuation of the
# current one.
blocks = []
block = []
with open('input.csv') as f_input:
    for line in f_input:
        if '|' in line:
            if block:
                blocks.append(''.join(block).strip('\n').split('|'))
            block = [line]
        else:
            block.append(line)
# Flush the final record (the loop only flushes on seeing the next one).
if block:
    blocks.append(''.join(block).strip('\n').split('|'))
print(blocks)
This would produce a list of blocks as follows:
[['id', 'value (header)'], ['12', '¤this is the\nvalue¤'], ['34', '¤this is another\nvalue¤']]

Related

csv header in python only on the top row?

i have written a python program which makes an api call to a webserver once every minute and then parse the json response and saves parsed values in to the csv files.
here is the code that is saving the values into the csv file :
with open('data.csv', 'a', newline='') as file:
writer = csv.writer(file)
writer.writerow([current_time,SHORTPERC, LONGPERC, SHORTvolume, longVolume, longPositions, shortPositions])
how can i make it so that it saves the header only once on the top most row and not on every row ?
UPDATE:
here is a bit of more code to make api call and write the data to file :
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.cron import CronTrigger
import requests
import json
import csv
from datetime import datetime
def fn():
    """Fetch the EURUSD community outlook and append one timestamped
    snapshot to each of the three CSV files.

    Side effects: one HTTP GET, three file appends. No return value.
    """
    print("Starting...")
    # NOTE(review): secret session id hard-coded in source — move to config.
    session_id = "auZsJ4F2RsQNJxSPTMDt2238324"
    Outlook = 'http://www.myfxbook.com/api/get-community-outlook.json?session=' + session_id
    Outlook_response = requests.get(Outlook)
    Outlook_data = Outlook_response.json()['symbols']

    now = datetime.now()
    current_time = now.strftime("%H:%M")

    # First symbol in the response is EURUSD (presumably fixed ordering —
    # TODO confirm against the API).
    EURUSD = Outlook_data[0]
    SHORTPERC = EURUSD['shortPercentage']
    LONGPERC = EURUSD['longPercentage']
    SHORTvolume = EURUSD['shortVolume']
    longVolume = EURUSD['longVolume']
    longPositions = EURUSD['longPositions']
    shortPositions = EURUSD['shortPositions']

    def _append_row(filename, row):
        # newline='' prevents csv.writer emitting blank lines on Windows.
        with open(filename, 'a', newline='') as f:
            csv.writer(f).writerow(row)

    _append_row('data.csv', [current_time, SHORTPERC, LONGPERC, SHORTvolume,
                             longVolume, longPositions, shortPositions])
    _append_row('data1.csv', [SHORTvolume, longVolume])
    _append_row('data2.csv', [SHORTPERC, LONGPERC])
I can't post the full code since it's around 700 lines long, but the above-mentioned code should be enough to create the CSV file.
this is how one of my csv files look :
07:11,31,69,555.55,1265.14,4750,2607
07:12,31,69,555.55,1265.16,4751,2607
07:13,31,69,555.55,1265.16,4751,2607
07:14,30,70,555.56,1267.36,4752,2608
07:15,30,70,555.56,1267.36,4752,2608
07:16,30,70,555.56,1267.36,4752,2608
07:17,30,70,555.46,1267.36,4752,2607
07:18,31,69,558.61,1267.36,4752,2610
07:19,31,69,558.61,1267.37,4753,2610
07:20,31,69,561.58,1267.37,4753,2611
07:21,31,69,561.61,1267.37,4753,2613
07:22,31,69,561.65,1267.37,4753,2614
07:23,31,69,561.65,1267.36,4752,2614
this is just part of the csv file , more rows keep adding as time passes
EDIT 2:
answer suggested by Sparkofska seems to work but somehow it ends up giving an empty row in between every line like this:
Time,ShortPer,LongPer,ShortVolume,LongVolume,ShortPosition,LongPosition
05:47,44,56,19528.8,24789.27,65223,48630
05:48,44,56,19529.04,24789.27,65223,48633
code :
EURUSD=Outlook_data[0]
SHORTPERC=EURUSD['shortPercentage']
LONGPERC =EURUSD['longPercentage']
SHORTvolume=EURUSD['shortVolume']
longVolume=EURUSD['longVolume']
longPositions=EURUSD['longPositions']
shortPositions=EURUSD['shortPositions']
filename='EURUSD.csv';
def write_row_header_aware(filename, row):
    """Append *row* to *filename*, writing the header line first when the
    file does not exist yet or is empty.

    Fixes vs. the original: opens with newline='' (without it csv.writer
    emits an extra blank line between records on Windows — the exact
    symptom described above), and writes the *row* argument instead of
    module-level globals.
    """
    import os
    import csv
    if not os.path.exists(filename) or os.stat(filename).st_size == 0:
        with open(filename, 'a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(['Time', 'ShortPer', 'LongPer', 'ShortVolume', 'LongVolume', 'ShortPosition', 'LongPosition'])
    with open(filename, 'a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(row)
write_row_header_aware(filename, [current_time,SHORTPERC, LONGPERC, SHORTvolume, longVolume, longPositions, shortPositions])
print("done...")
You could wrap the writerow function to have it automatically add the header if needed.
If your output csv file is not empty, we can assert the header was already written and simply append the row. Otherwise (file not exist or empty) we write the header before appending the row.
import os

def write_row_header_aware(filename, row):
    """Append one data row, emitting the header first for a fresh or empty file."""
    header_missing = (not os.path.exists(filename)
                      or os.stat(filename).st_size == 0)
    if header_missing:
        with open(filename, 'a', newline='') as fh:
            csv.writer(fh).writerow(['current_time', 'SHORTPERC', 'LONGPERC', ...])
    # Normal case: just append the row.
    with open(filename, 'a', newline='') as fh:
        csv.writer(fh).writerow(row)
write_row_header_aware('data.csv', [current_time, SHORTPERC, LONGPERC, ...])
Please add a check for whether the file already exists. If it does, append the rows to it; otherwise write the headers first. This way you avoid writing the headers multiple times. Please refer to this link.

Print csv to console

I am reading in a csv file with ten lines to transfer to JSON, console output as below. Python code attached. First step is to print csv data to console and error below occurring.
Syntax errors were occurring, but after fixing them this error began.
data = {}
with open(csvFilePath) as csvFile:
csvReader = csv.DictReader(csvFile)
for csvRow in csvReader:
hmid = csvRow["hmid"]
data[hmid] = csvRow
Console output:
python csvjson.py
Traceback (most recent call last):
File "csvjson.py", line 12, in <module>
hmid = csvRow["hmid"]
KeyError: 'hmid'
Expected Output:
Prints out the CSV data to the console.
The KeyError exception means that the key you are requesting does not exist in that dictionary.
If that column "hmid" does not exist in every row of the csv, consider using the dict.get() method. This will return None if the key does not exist in the dictionary instead of the KeyError.
Alternatively you can catch that KeyError and skip the row. That would look something like this.
data = {}
with open(csvFilePath) as csvFile:
csvReader = csv.DictReader(csvFile)
for csvRow in csvReader:
try:
data[csvRow["hmid"]] = csvRow
except KeyError:
pass
Or check if the key is in the dictionary before proceeding.
data = {}
with open(csvFilePath) as csvFile:
csvReader = csv.DictReader(csvFile)
for csvRow in csvReader:
if "hmid" not in csvRow.keys():
continue
data[csvRow["hmid"]] = csvRow
I made the following file 'test.csv' :
hmid,first_name,last_name,email,gender,passport_number,departure_city,arrival_city,aircraft_type
1,Lotstring,Duobam,anatwick0#samsung.com,Female,7043833787,Changtang,Tours,B737
2,Rover,Red,rr#nowhere.com,Female,7043833787,Changtang,Tours,B737
pasted your code into Python 2.7, and it worked fine. data has two rows.
Maybe your file had an issue with terminators.
Some CSV files (e.g. those saved by Excel) have a BOM (byte order mark) at the beginning of the file, so when you open the file you need to specify encoding='utf-8-sig' on the file open. Here is your code, corrected:
data = {}
with open(csvFilePath, encoding='utf-8-sig') as csvFile:
csvReader = csv.DictReader(csvFile)
for csvRow in csvReader:
hmid = csvRow["hmid"]
data[hmid] = csvRow

Python read CSV file columns and write file name and column name in a csv file

I have many CSV files, need to read all the files in loop and write file name and all the columns (header in row 1) in an output file.
Example
Input csv file 1 (test1.csv)
Id, Name, Age, Location
1, A, 25, India
Input csv file 2 (test2.csv)
Id, ProductName
1, ABC
Outputfile
test1.csv Id
test1.csv Name
test1.csv Age
test1.csv Location
test2.csv Id
test2.csv ProductName
Many thanks for your help.
Update:
This code works fine for this purpose:
import os
import csv
ofile = open('D:\Anuj\Personal\OutputFile/AHS_File_Columns_Info.csv', 'w')
directory = os.path.join('D:\Anuj\Personal\Python')
for root, dirs, files in os.walk(directory):
for file in files:
fullfilepath = directory + "/" + file
with open(fullfilepath,'r') as f:
output = file +','+ f.readline()
ofile.write(output)
clean solution using csv module for reading and writing
open output file and create a csv.writer instance on its handle
open each input file and create a csv.reader instance on their handle
get first row using next on the csv.reader iterator: gets titles as list (with a small post-processing to remove the spaces)
write titles alongside the current filename in a loop
code:
import csv

# For each input file, emit one tab-separated line per column title:
# <filename>\t<title>
files = ["test1.csv", "test2.csv"]
with open("output.tsv", "w", newline='') as fw:
    cw = csv.writer(fw, delimiter="\t")  # output is tab delimited
    for filename in files:
        with open(filename, 'r') as f:
            cr = csv.reader(f)
            # First row only: the stripped column titles.
            titles = [cell.strip() for cell in next(cr)]
            for column_name in titles:
                cw.writerow([filename, column_name])
There are several advantages using csv module, the most important being that quoting & multi-line fields/titles are managed properly.
But I'm not sure I understand you correctly.
import csv
from typing import List
from typing import Tuple
TableType = List[List[str]]
def load_csv_table(file_name: str) -> Tuple[List[str], TableType]:
    """Read a CSV file and return (header row, remaining data rows)."""
    with open(file_name) as csv_file:
        rows = csv.reader(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        headers = next(rows)
        data_table = [row for row in rows]
    return headers, data_table
def save_csv_table(file_name: str, headers: List[str], data_table: TableType):
    """Write *headers* followed by every row of *data_table* to a CSV file."""
    with open(file_name, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerows([headers] + data_table)
input_files = ['file1.csv', 'file2.csv', 'file3.csv']

# Stitch all input tables together, tagging each row with its source file.
new_table = []
new_headers = []
for file_name in input_files:
    headers, data_table = load_csv_table(file_name)
    if not new_headers:
        # Take the header from the first file and prepend a 'Source' column.
        new_headers = ['Source'] + headers
    new_table.extend([[file_name] + line for line in data_table])
save_csv_table('output.csv', new_headers, new_table)
A simple method is to use readline() on the file object:
files=["test1.csv","test2.csv"]
for my_file in files:
with open(my_file,'r') as f:
print my_file, f.readline()

How can I check if the first line of a CSV file has been written to?

So I'm running this Python script daily, but I want it to check if the header line is already written, write it if it isn't, and skip it if it is. I've tried doing things like reading the first line and setting a variable if there's input, but it hasn't worked. Here's my code:
def addDomainsToFile(domainList):
    """Append each domain in *domainList* to today's CSV file, writing the
    header row only when the file is new or empty.

    Fixes vs. the original: `for row in fileName` iterated the *characters*
    of the filename string and wrote the header once per character; the
    'ab+' binary mode also breaks the csv module on Python 3.
    """
    import os
    date = time.strftime("%d:%m:%Y")
    fileName = 'MagDomains%s.csv' % date
    # Header is needed only for a brand-new (or empty) file.
    needs_header = not os.path.exists(fileName) or os.path.getsize(fileName) == 0
    with open(fileName, 'a', newline='') as c:
        writer = csv.writer(c, dialect='excel', delimiter=',')
        if needs_header:
            writer.writerow(['domain', 'ip', 'identifier', 'relationship', 'related To'])
        for item in domainList:
            writer.writerow([item, '', 'Related'])
How about checking if the file size of the csv is greater than zero?
Should be enough for a rudimentary check:
import os
if os.path.getsize(fileName) == 0:
write_header()
You can read the first row of your csv using csv.reader and the next function, and compare with your first row:
with open(fileName, 'ab+') as c:
writer = csv.writer(c, dialect= 'excel', delimiter = ',')
try :
first_row = next(csv.reader(c, dialect = 'excel', delimiter = ','))
for item in domainList:
writer.writerow([item, '', 'Related'])
except StopIteration :
writer.writerow(['domain', 'ip', 'identifier', 'relationship', 'related To'])
for item in domainList:
writer.writerow([item, '', 'Related'])
See if csv.Sniffer.has_header works for you.
https://docs.python.org/2/library/csv.html#csv.Sniffer
I needed to do this too and had to make some changes to Kasramvd's solution in order to make it work.
When you use 'a+' mode, the file pointer is at the end. So you have to jump to the beginning of the file to read the first line.
After reading the header (if there is one), you can jump back to the end to append to the file.
with open(filename, 'a+') as f: # Just use 'w' mode in 3.x
logger.info("Opened file")
f.seek(0) # jump to the beginning of the file
try:
header = csv.reader(f).next()
dict_writer = csv.DictWriter(f, header) # header found
except StopIteration: # no header found
dict_writer = csv.DictWriter(f, my_dict.keys())
dict_writer.writeheader()
f.seek(0,2) # jump back to the end of the file
try:
dict_writer.writerow(my_dict)
except ValueError:
# some keys from my_dict are not in header

How to convert CSV file to multiline JSON?

Here's my code, really simple stuff...
import csv
import json
csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')
fieldnames = ("FirstName","LastName","IDNumber","Message")
reader = csv.DictReader( csvfile, fieldnames)
out = json.dumps( [ row for row in reader ] )
jsonfile.write(out)
Declare some field names; the reader uses csv to read the file, and the field names to dump the file to a JSON format. Here's the problem...
Each record in the CSV file is on a different row. I want the JSON output to be the same way. The problem is it dumps it all on one giant, long line.
I've tried using something like for line in csvfile: and then running my code below that with reader = csv.DictReader( line, fieldnames) which loops through each line, but it does the entire file on one line, then loops through the entire file on another line... continues until it runs out of lines.
Any suggestions for correcting this?
Edit: To clarify, currently I have: (every record on line 1)
[{"FirstName":"John","LastName":"Doe","IDNumber":"123","Message":"None"},{"FirstName":"George","LastName":"Washington","IDNumber":"001","Message":"Something"}]
What I'm looking for: (2 records on 2 lines)
{"FirstName":"John","LastName":"Doe","IDNumber":"123","Message":"None"}
{"FirstName":"George","LastName":"Washington","IDNumber":"001","Message":"Something"}
Not each individual field indented/on a separate line, but each record on its own line.
Some sample input.
"John","Doe","001","Message1"
"George","Washington","002","Message2"
The problem with your desired output is that it is not a valid JSON document; it's a stream of JSON documents!
That's okay, if its what you need, but that means that for each document you want in your output, you'll have to call json.dumps.
Since the newline you want separating your documents is not contained in those documents, you're on the hook for supplying it yourself. So we just need to pull the loop out of the call to json.dump and interpose newlines for each document written.
import csv
import json

# Stream each CSV record as one JSON document per line (JSON Lines).
# Context managers guarantee both handles are closed and the output is
# flushed — the original leaked both file objects.
fieldnames = ("FirstName", "LastName", "IDNumber", "Message")
with open('file.csv', 'r', newline='') as csvfile, open('file.json', 'w') as jsonfile:
    reader = csv.DictReader(csvfile, fieldnames)
    for row in reader:
        json.dump(row, jsonfile)
        jsonfile.write('\n')
You can use Pandas DataFrame to achieve this, with the following Example:
import pandas as pd
csv_file = pd.DataFrame(pd.read_csv("path/to/file.csv", sep = ",", header = 0, index_col = False))
csv_file.to_json("/path/to/new/file.json", orient = "records", date_format = "epoch", double_precision = 10, force_ascii = True, date_unit = "ms", default_handler = None)
import csv
import json
file = 'csv_file_name.csv'
json_file = 'output_file_name.json'
#Read CSV File
def read_CSV(file, json_file):
    """Load every row of *file* as a dict and hand the list to convert_write_json."""
    with open(file) as csvfile:
        reader = csv.DictReader(csvfile)
        names = reader.fieldnames
        # One dict per row, keyed by the header names in order.
        csv_rows = [{name: record[name] for name in names} for record in reader]
    convert_write_json(csv_rows, json_file)
#Convert csv data into json
def convert_write_json(data, json_file):
    """Write *data* to *json_file* as pretty-printed JSON.

    Fixes vs. the original: it dumped the data twice (a pretty dump
    followed by a compact dump appended to the same file), producing an
    invalid JSON file. A single dump is written instead.
    """
    with open(json_file, "w") as f:
        json.dump(data, f, sort_keys=False, indent=4, separators=(',', ': '))
read_CSV(file,json_file)
Documentation of json.dumps()
I took #SingleNegationElimination's response and simplified it into a three-liner that can be used in a pipeline:
import csv
import json
import sys

# Pipeline filter: one JSON document per CSV row, newline-delimited.
for record in csv.DictReader(sys.stdin):
    json.dump(record, sys.stdout)
    sys.stdout.write('\n')
You can try this
import csvmapper
# how does the object look
mapper = csvmapper.DictMapper([
[
{ 'name' : 'FirstName'},
{ 'name' : 'LastName' },
{ 'name' : 'IDNumber', 'type':'int' },
{ 'name' : 'Messages' }
]
])
# parser instance
parser = csvmapper.CSVParser('sample.csv', mapper)
# conversion service
converter = csvmapper.JSONConverter(parser)
print converter.doConvert(pretty=True)
Edit:
Simpler approach
import csvmapper
fields = ('FirstName', 'LastName', 'IDNumber', 'Messages')
parser = CSVParser('sample.csv', csvmapper.FieldMapper(fields))
converter = csvmapper.JSONConverter(parser)
print converter.doConvert(pretty=True)
I see this is old but I needed the code from SingleNegationElimination however I had issue with the data containing non utf-8 characters. These appeared in fields I was not overly concerned with so I chose to ignore them. However that took some effort. I am new to python so with some trial and error I got it to work. The code is a copy of SingleNegationElimination with the extra handling of utf-8. I tried to do it with https://docs.python.org/2.7/library/csv.html but in the end gave up. The below code worked.
import csv, json
csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')
fieldnames = ("Scope","Comment","OOS Code","In RMF","Code","Status","Name","Sub Code","CAT","LOB","Description","Owner","Manager","Platform Owner")
reader = csv.DictReader(csvfile , fieldnames)
code = ''
for row in reader:
try:
print('+' + row['Code'])
for key in row:
row[key] = row[key].decode('utf-8', 'ignore').encode('utf-8')
json.dump(row, jsonfile)
jsonfile.write('\n')
except:
print('-' + row['Code'])
raise
Add the indent parameter to json.dumps
data = {'this': ['has', 'some', 'things'],
'in': {'it': 'with', 'some': 'more'}}
print(json.dumps(data, indent=4))
Also note that, you can simply use json.dump with the open jsonfile:
json.dump(data, jsonfile)
Use pandas and the json library:
import pandas as pd
import json

filepath = "inputfile.csv"
output_path = "outputfile.json"

df = pd.read_csv(filepath)

# Create a multiline JSON (one document per line). Each record must be
# re-serialized with json.dumps: the original's "%s\n" % item wrote the
# Python dict repr (single quotes), which is not valid JSON.
json_list = json.loads(df.to_json(orient="records"))
with open(output_path, 'w') as f:
    for item in json_list:
        f.write(json.dumps(item) + "\n")
How about using Pandas to read the csv file into a DataFrame (pd.read_csv), then manipulating the columns if you want (dropping them or updating values) and finally converting the DataFrame back to JSON (pd.DataFrame.to_json).
Note: I haven't checked how efficient this will be but this is definitely one of the easiest ways to manipulate and convert a large csv to json.
As slight improvement to #MONTYHS answer, iterating through a tup of fieldnames:
import csv
import json

csvfilename = 'filename.csv'
jsonfilename = csvfilename.split('.')[0] + '.json'

fieldnames = ('FirstName', 'LastName', 'IDNumber', 'Message')

# Context managers close both files and flush the JSON output — the
# original leaked both handles, risking a truncated output file.
with open(csvfilename, 'r', newline='') as csvfile, open(jsonfilename, 'w') as jsonfile:
    reader = csv.DictReader(csvfile)
    # Keep only the listed fields, one dict per record.
    output = [{field: each[field] for field in fieldnames} for each in reader]
    json.dump(output, jsonfile, indent=2, sort_keys=True)
def read():
    """Convert up to noOfElem rows of the CSV into a JSON-array file.

    Fixes vs. the original: the closing "]" was only written when the CSV
    had at least noOfElem rows — a shorter file produced an unterminated
    (invalid) JSON array, and could leave a trailing comma.
    """
    noOfElem = 200  # max number of rows to export
    csv_file_name = "hashtag_donaldtrump.csv"  # csv file name
    json_file_name = "hashtag_donaldtrump.json"  # json file name
    with open(csv_file_name, mode='r', newline='') as csv_file:
        csv_reader = csv.DictReader(csv_file)
        with open(json_file_name, 'w') as json_file:
            json_file.write("[")
            first = True
            for i, row in enumerate(csv_reader):
                if i >= noOfElem:
                    break
                # Comma *before* each element after the first — no
                # trailing comma regardless of where the input ends.
                if not first:
                    json_file.write(",")
                json_file.write(json.dumps(row))
                first = False
            # Always terminate the array.
            json_file.write("]")
Change the above three parameters and everything will be done.
import csv
import json

# csv.DictReader needs an open file object, not a filename string (the
# original also had an unbalanced parenthesis — a syntax error).
with open('filename.csv', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    output = []
    for each in reader:
        row = {}
        row['FirstName'] = each['FirstName']
        row['LastName'] = each['LastName']
        row['IDNumber'] = each['IDNumber']
        row['Message'] = each['Message']
        output.append(row)

# Open the output inside a context manager so it is closed and flushed.
with open('filename.json', 'w') as jsonfile:
    json.dump(output, jsonfile, indent=4, sort_keys=False)

Categories

Resources