I am trying to write all the rows that contain the string: from a bunch of Text files. This is my code:
import os
import glob
import csv
import re
#Defining Keyword
keyword = '2012-07-02'
#Code to merge all relevant LOG files into one file and insert
with open('Combined-01022012.txt' , 'w', newline = '') as combined_file:
csv_output = csv.writer(combined_file)
for filename in glob.glob('FAO_Agg_2012_Part_*.txt'):
with open(filename, 'rt', newline = '') as f_input:
#with gzip.open((filename.split('.')[0]) + '.gz', 'rt', newline='') as f_input:
csv_input = csv.reader(f_input)
for row in csv_input:
row.insert(0, os.path.basename(filename))
try:
if keyword in row[2]:
csv_output.writerow(row)
#row.insert(0, os.path.basename(filename))
#csv_output.writerow(row)
except:
continue
continue
Everything seems to be right and the code runs but nothing gets written on to my text file. What could be going wrong?
Your main problem is in the lines:
row.insert(0, os.path.basename(filename))
try:
if keyword in row[0]:
csv_output.writerow(row)
except:
continue
You're essentially inserting your parent folder name of the current file as the first entry of your row, and then on the very next line you're checking if that entry (row[0]) contains your keyword. Unless the parent folder contains your keyword (2012-07-02) that condition will never evaluate as True. I'd mix this up as:
if keyword in row[0]:
csv_output.writerow([os.path.basename(filename)] + row)
Also, using blank except is a very, very bad idea. If you're looking to capture a specific exception, define it in your except clause.
I'm creating a Django app and I need to import several *.csv files.
One's of this file has this structure:
id|value (header)
12|¤this is the
value¤
34|¤this is another
value¤
I use this code for parse the file:
try:
csvfile = open(path, "r", encoding='utf-16')
except IOError:
return False
cursor.copy_from(csvfile , tblname, columns=['id', 'value'], sep='|')
But when I try to parse this file, it gave me this error:
psycopg2.DataError: ERROR: missing data for the column "value"
Is there a way to parse this file keeping carriage return inside text identifier ('¤')?
You could use Pythons csv module for reading that.
import csv
try:
csvfile = open(path, newline='')
except IOError:
return False
csvreader = csv.reader(csvfile, delimiter='|', quotechar='¤')
for row in csvreader:
print(', '.join(row)) # or do something else with the row of data.
One approach would be to build up the entries yourself as follows:
blocks = []
block = []
with open('input.csv') as f_input:
for row in f_input:
if '|' in row:
if len(block):
blocks.append(''.join(block).strip('\n').split('|'))
block = []
block.append(row)
else:
block.append(row)
if len(block):
blocks.append(''.join(block).strip('\n').split('|'))
print(blocks)
This would produce a list of blocks as follows:
[['id', 'value (header)'], ['12', '¤this is the\nvalue¤'], ['34', '¤this is another\nvalue¤']]
Im writing a script where one of its functions is to read a CSV file that contain URLs on one of its rows. Unfortunately the system that create those CSVs doesn't put double-quotes on values inside the URL column so when the URL contain commas it breaks all my csv parsing.
This is the code I'm using:
with open(accesslog, 'r') as csvfile, open ('results.csv', 'w') as enhancedcsv:
reader = csv.DictReader(csvfile)
for row in reader:
self.uri = (row['URL'])
self.OriCat = (row['Category'])
self.query(self.uri)
print self.URL+","+self.ServerIP+","+self.OriCat+","+self.NewCat"
This is a sample URL that is breaking up the parsing - this URL comes on the row named "URL". (note the commas at the end)
ams1-ib.adnxs.com/ww=1238&wh=705&ft=2&sv=43&tv=view5-1&ua=chrome&pl=mac&x=1468251839064740641,439999,v,mac,webkit_chrome,view5-1,0,,2,
The following row after the URL always come with a numeric value between parenthesis. Ex: (9999) so this could be used to define when the URL with commas end.
How can i deal with a situation like this using the csv module?
You will have to do it a little more manually. Try this
def process(lines, delimiter=','):
header = None
url_index_from_start = None
url_index_from_end = None
for line in lines:
if not header:
header = [l.strip() for l in line.split(delimiter)]
url_index_from_start = header.index('URL')
url_index_from_end = len(header)-url_index_from_start
else:
data = [l.strip() for l in line.split(delimiter)]
url_from_start = url_index_from_start
url_from_end = len(data)-url_index_from_end
values = data[:url_from_start] + data[url_from_end+1:] + [delimiter.join(data[url_from_start:url_from_end+1])]
keys = header[:url_index_from_start] + header[url_index_from_end+1:] + [header[url_index_from_start]]
yield dict(zip(keys, values))
Usage:
lines = ['Header1, Header2, URL, Header3',
'Content1, "Content2", abc,abc,,abc, Content3']
result = list(process(lines))
assert result[0]['Header1'] == 'Content1'
assert result[0]['Header2'] == '"Content2"'
assert result[0]['Header3'] == 'Content3'
assert result[0]['URL'] == 'abc,abc,,abc'
print(result)
Result:
>>> [{'URL': 'abc,abc,,abc', 'Header2': '"Content2"', 'Header3': 'Content3', 'Header1': 'Content1'}]
Have you considered using Pandas to read your data in?
Another possible solution would be to use regular expressions to pre-process the data...
#make a list of everything you want to change
old = re.findall(regex, f.read())
#append quotes and create a new list
new = []
for url in old:
url2 = "\""+url+"\""
new.append(url2)
#combine the lists
old_new = list(zip(old,new))
#Then use the list to update the file:
f = open(filein,'r')
filedata = f.read()
f.close()
for old,new in old_new:
newdata = filedata.replace(old,new)
f = open(filein,'w')
f.write(newdata)
f.close()
I have a text file consisting of 100 records like
fname,lname,subj1,marks1,subj2,marks2,subj3,marks3.
I need to extract and print lname and marks1+marks2+marks3 in python. How do I do that?
I am a beginner in python.
Please help
When I used split, i got an error saying
TypeError: Can't convert 'type' object to str implicitly.
The code was
import sys
file_name = sys.argv[1]
file = open(file_name, 'r')
for line in file:
fname = str.split(str=",", num=line.count(str))
print fname
If you want to do it that way, you were close. Is this what you were trying?
file = open(file_name, 'r')
for line in file.readlines():
fname = line.rstrip().split(',') #using rstrip to remove the \n
print fname
Note: its not a tested code. but it tries to solve your problem. Please give it a try
import csv
with open(file_name, 'rb') as csvfile:
marksReader = csv.reader(csvfile)
for row in marksReader:
if len(row) < 8: # 8 is the number of columns in your file.
# row has some missing columns or empty
continue
# Unpack columns of row; you can also do like fname = row[0] and lname = row[1] and so on ...
(fname,lname,subj1,marks1,subj2,marks2,subj3,marks3) = *row
# you can use float in place of int if marks contains decimals
totalMarks = int(marks1) + int(marks2) + int(marks3)
print '%s %s scored: %s'%(fname, lname, totalMarks)
print 'End.'
"""
sample file content
poohpool#signet.com; meixin_kok#hotmail.com; ngai_nicole#hotmail.com; isabelle_gal#hotmail.com; michelle-878#hotmail.com;
valerietan98#gmail.com; remuskan#hotmail.com; genevieve.goh#hotmail.com; poonzheng5798#yahoo.com; burgergirl96#hotmail.com;
insyirah_powergals#hotmail.com; little_princess-angel#hotmail.com; ifah_duff#hotmail.com; tweety_butt#hotmail.com;
choco_ela#hotmail.com; princessdyanah#hotmail.com;
"""
import pandas as pd
file = open('emaildump.txt', 'r')
for line in file.readlines():
fname = line.split(';') #using split to form a list
#print(fname)
df1 = pd.DataFrame(fname,columns=['Email'])
print(df1)
Here's my code, really simple stuff...
import csv
import json
csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')
fieldnames = ("FirstName","LastName","IDNumber","Message")
reader = csv.DictReader( csvfile, fieldnames)
out = json.dumps( [ row for row in reader ] )
jsonfile.write(out)
Declare some field names, the reader uses CSV to read the file, and the filed names to dump the file to a JSON format. Here's the problem...
Each record in the CSV file is on a different row. I want the JSON output to be the same way. The problem is it dumps it all on one giant, long line.
I've tried using something like for line in csvfile: and then running my code below that with reader = csv.DictReader( line, fieldnames) which loops through each line, but it does the entire file on one line, then loops through the entire file on another line... continues until it runs out of lines.
Any suggestions for correcting this?
Edit: To clarify, currently I have: (every record on line 1)
[{"FirstName":"John","LastName":"Doe","IDNumber":"123","Message":"None"},{"FirstName":"George","LastName":"Washington","IDNumber":"001","Message":"Something"}]
What I'm looking for: (2 records on 2 lines)
{"FirstName":"John","LastName":"Doe","IDNumber":"123","Message":"None"}
{"FirstName":"George","LastName":"Washington","IDNumber":"001","Message":"Something"}
Not each individual field indented/on a separate line, but each record on it's own line.
Some sample input.
"John","Doe","001","Message1"
"George","Washington","002","Message2"
The problem with your desired output is that it is not valid json document,; it's a stream of json documents!
That's okay, if its what you need, but that means that for each document you want in your output, you'll have to call json.dumps.
Since the newline you want separating your documents is not contained in those documents, you're on the hook for supplying it yourself. So we just need to pull the loop out of the call to json.dump and interpose newlines for each document written.
import csv
import json
csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')
fieldnames = ("FirstName","LastName","IDNumber","Message")
reader = csv.DictReader( csvfile, fieldnames)
for row in reader:
json.dump(row, jsonfile)
jsonfile.write('\n')
You can use Pandas DataFrame to achieve this, with the following Example:
import pandas as pd
csv_file = pd.DataFrame(pd.read_csv("path/to/file.csv", sep = ",", header = 0, index_col = False))
csv_file.to_json("/path/to/new/file.json", orient = "records", date_format = "epoch", double_precision = 10, force_ascii = True, date_unit = "ms", default_handler = None)
import csv
import json
file = 'csv_file_name.csv'
json_file = 'output_file_name.json'
#Read CSV File
def read_CSV(file, json_file):
csv_rows = []
with open(file) as csvfile:
reader = csv.DictReader(csvfile)
field = reader.fieldnames
for row in reader:
csv_rows.extend([{field[i]:row[field[i]] for i in range(len(field))}])
convert_write_json(csv_rows, json_file)
#Convert csv data into json
def convert_write_json(data, json_file):
with open(json_file, "w") as f:
f.write(json.dumps(data, sort_keys=False, indent=4, separators=(',', ': '))) #for pretty
f.write(json.dumps(data))
read_CSV(file,json_file)
Documentation of json.dumps()
I took #SingleNegationElimination's response and simplified it into a three-liner that can be used in a pipeline:
import csv
import json
import sys
for row in csv.DictReader(sys.stdin):
json.dump(row, sys.stdout)
sys.stdout.write('\n')
You can try this
import csvmapper
# how does the object look
mapper = csvmapper.DictMapper([
[
{ 'name' : 'FirstName'},
{ 'name' : 'LastName' },
{ 'name' : 'IDNumber', 'type':'int' },
{ 'name' : 'Messages' }
]
])
# parser instance
parser = csvmapper.CSVParser('sample.csv', mapper)
# conversion service
converter = csvmapper.JSONConverter(parser)
print converter.doConvert(pretty=True)
Edit:
Simpler approach
import csvmapper
fields = ('FirstName', 'LastName', 'IDNumber', 'Messages')
parser = CSVParser('sample.csv', csvmapper.FieldMapper(fields))
converter = csvmapper.JSONConverter(parser)
print converter.doConvert(pretty=True)
I see this is old but I needed the code from SingleNegationElimination however I had issue with the data containing non utf-8 characters. These appeared in fields I was not overly concerned with so I chose to ignore them. However that took some effort. I am new to python so with some trial and error I got it to work. The code is a copy of SingleNegationElimination with the extra handling of utf-8. I tried to do it with https://docs.python.org/2.7/library/csv.html but in the end gave up. The below code worked.
import csv, json
csvfile = open('file.csv', 'r')
jsonfile = open('file.json', 'w')
fieldnames = ("Scope","Comment","OOS Code","In RMF","Code","Status","Name","Sub Code","CAT","LOB","Description","Owner","Manager","Platform Owner")
reader = csv.DictReader(csvfile , fieldnames)
code = ''
for row in reader:
try:
print('+' + row['Code'])
for key in row:
row[key] = row[key].decode('utf-8', 'ignore').encode('utf-8')
json.dump(row, jsonfile)
jsonfile.write('\n')
except:
print('-' + row['Code'])
raise
Add the indent parameter to json.dumps
data = {'this': ['has', 'some', 'things'],
'in': {'it': 'with', 'some': 'more'}}
print(json.dumps(data, indent=4))
Also note that, you can simply use json.dump with the open jsonfile:
json.dump(data, jsonfile)
Use pandas and the json library:
import pandas as pd
import json
filepath = "inputfile.csv"
output_path = "outputfile.json"
df = pd.read_csv(filepath)
# Create a multiline json
json_list = json.loads(df.to_json(orient = "records"))
with open(output_path, 'w') as f:
for item in json_list:
f.write("%s\n" % item)
How about using Pandas to read the csv file into a DataFrame (pd.read_csv), then manipulating the columns if you want (dropping them or updating values) and finally converting the DataFrame back to JSON (pd.DataFrame.to_json).
Note: I haven't checked how efficient this will be but this is definitely one of the easiest ways to manipulate and convert a large csv to json.
As slight improvement to #MONTYHS answer, iterating through a tup of fieldnames:
import csv
import json
csvfilename = 'filename.csv'
jsonfilename = csvfilename.split('.')[0] + '.json'
csvfile = open(csvfilename, 'r')
jsonfile = open(jsonfilename, 'w')
reader = csv.DictReader(csvfile)
fieldnames = ('FirstName', 'LastName', 'IDNumber', 'Message')
output = []
for each in reader:
row = {}
for field in fieldnames:
row[field] = each[field]
output.append(row)
json.dump(output, jsonfile, indent=2, sort_keys=True)
def read():
noOfElem = 200 # no of data you want to import
csv_file_name = "hashtag_donaldtrump.csv" # csv file name
json_file_name = "hashtag_donaldtrump.json" # json file name
with open(csv_file_name, mode='r') as csv_file:
csv_reader = csv.DictReader(csv_file)
with open(json_file_name, 'w') as json_file:
i = 0
json_file.write("[")
for row in csv_reader:
i = i + 1
if i == noOfElem:
json_file.write("]")
return
json_file.write(json.dumps(row))
if i != noOfElem - 1:
json_file.write(",")
Change the above three parameter, everything will be done.
import csv
import json
csvfile = csv.DictReader('filename.csv', 'r'))
output =[]
for each in csvfile:
row ={}
row['FirstName'] = each['FirstName']
row['LastName'] = each['LastName']
row['IDNumber'] = each ['IDNumber']
row['Message'] = each['Message']
output.append(row)
json.dump(output,open('filename.json','w'),indent=4,sort_keys=False)