Write from text files to CSV files - Python

I am trying to write out all the rows that contain a given date string ('2012-07-02') from a bunch of text files. This is my code:
import os
import glob
import csv
import re

# Defining the keyword
keyword = '2012-07-02'

# Code to merge all relevant LOG files into one file and insert
with open('Combined-01022012.txt', 'w', newline='') as combined_file:
    csv_output = csv.writer(combined_file)
    for filename in glob.glob('FAO_Agg_2012_Part_*.txt'):
        with open(filename, 'rt', newline='') as f_input:
        #with gzip.open((filename.split('.')[0]) + '.gz', 'rt', newline='') as f_input:
            csv_input = csv.reader(f_input)
            for row in csv_input:
                row.insert(0, os.path.basename(filename))
                try:
                    if keyword in row[0]:
                        csv_output.writerow(row)
                    #row.insert(0, os.path.basename(filename))
                    #csv_output.writerow(row)
                except:
                    continue
                continue
Everything seems to be right and the code runs, but nothing gets written to my output file. What could be going wrong?

Your main problem is in the lines:
row.insert(0, os.path.basename(filename))
try:
    if keyword in row[0]:
        csv_output.writerow(row)
except:
    continue
You're essentially inserting the base name of the current file as the first entry of your row, and then on the very next line you're checking whether that entry (row[0]) contains your keyword. Unless the file name contains your keyword (2012-07-02), that condition will never evaluate as True. I'd restructure this as:
if keyword in row[0]:
    csv_output.writerow([os.path.basename(filename)] + row)
Also, using a bare except is a very, very bad idea. If you're looking to catch a specific exception, name it in your except clause.
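Putting those fixes together, a minimal corrected sketch of the whole script might look like the following (here the keyword test scans every field rather than a single column, which is an assumption about the intent):
import csv
import glob
import os

keyword = '2012-07-02'

with open('Combined-01022012.txt', 'w', newline='') as combined_file:
    csv_output = csv.writer(combined_file)
    for filename in glob.glob('FAO_Agg_2012_Part_*.txt'):
        with open(filename, 'rt', newline='') as f_input:
            for row in csv.reader(f_input):
                # Prepend the file name only at write time, so the
                # keyword test sees the original, unmodified columns.
                if any(keyword in field for field in row):
                    csv_output.writerow([os.path.basename(filename)] + row)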

Related

Why is this code still adding blank lines between data in a csv file?

The following code is intended to remove tags from a csv file. Tags are labelled as follows (the tag here is hot as an example): tag,hot
It iterates through the current csv file, writing every line to a temporary file except the one with the tag that needs to be removed, then renames the temporary file over the original. Currently, it adds whitespace between every line in the original, e.g.:
tag,tag1
tag,tag2
tag,tag3
tag,tag4
tag,tag5
goes to the following if tag3 was passed as the tag to be removed:
tag,tag1
tag,tag2
tag,tag4
tag,tag5
As can be seen, there is an extra line where tag3 would have been, as well as blank lines between every other line. I have seen other threads that suggested adding newline='' when the files are opened, but I have done this and it still does not solve the problem (although there was actually more whitespace before, with two blank lines between every value rather than one). Not sure why this hasn't fixed it.
Code:
tempfile = NamedTemporaryFile(mode="w", delete=False, newline='')
CSVpart = imageNumber + ".csv"
lowercase = tag.lower()
filePath = os.path.join(os.curdir, "static", "csv", CSVpart)
if os.path.isfile(filePath):
    with open(filePath, 'r', newline='') as csvfile, tempfile:
        reader = csv.reader(csvfile, delimiter=',')
        writer = csv.writer(tempfile)
        for row in reader:
            if row != ['tag', lowercase]:
                writer.writerow(row)
            else:
                removed = True  # ignore this; it makes sense in context
    shutil.move(tempfile.name, filePath)
I created the following, which works. I cannot figure out what is different about it:
import shutil, csv, os
from tempfile import NamedTemporaryFile

tempfile = NamedTemporaryFile(mode="w", delete=False, newline='')
tagToRemove = "hot"
filePath = os.path.join(os.curdir, "test.csv")
with open(filePath, 'r', newline='') as csvfile, tempfile:
    reader = csv.reader(csvfile, delimiter=',')
    writer = csv.writer(tempfile)
    for row in reader:
        if row != ['tag', tagToRemove]:
            writer.writerow(row)
shutil.move(tempfile.name, filePath)

Django import CSV with carriage return

I'm creating a Django app and I need to import several *.csv files.
One of these files has this structure:
id|value (header)
12|¤this is the
value¤
34|¤this is another
value¤
I use this code to parse the file:
try:
    csvfile = open(path, "r", encoding='utf-16')
except IOError:
    return False
cursor.copy_from(csvfile, tblname, columns=['id', 'value'], sep='|')
But when I try to parse this file, it gives me this error:
psycopg2.DataError: ERROR: missing data for the column "value"
Is there a way to parse this file while keeping carriage returns inside the text identifier ('¤')?
You could use Python's csv module for reading that.
import csv

try:
    csvfile = open(path, newline='')
except IOError:
    return False

csvreader = csv.reader(csvfile, delimiter='|', quotechar='¤')
for row in csvreader:
    print(', '.join(row))  # or do something else with the row of data.
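Since the end goal is loading these rows into the database, a hedged follow-up sketch (assuming a psycopg2 cursor and the same two-column tblname table as in the question) would be to insert the parsed rows instead of using copy_from:
# Instead of the print loop above, collect the rows and insert them.
rows = list(csvreader)
if rows and rows[0][0] == 'id':
    rows = rows[1:]  # drop the header row, if present
cursor.executemany(
    'INSERT INTO tblname (id, value) VALUES (%s, %s)',
    rows,
)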
One approach would be to build up the entries yourself as follows:
blocks = []
block = []

with open('input.csv') as f_input:
    for row in f_input:
        if '|' in row:
            if len(block):
                blocks.append(''.join(block).strip('\n').split('|'))
            block = []
            block.append(row)
        else:
            block.append(row)

if len(block):
    blocks.append(''.join(block).strip('\n').split('|'))

print(blocks)
This would produce a list of blocks as follows:
[['id', 'value (header)'], ['12', '¤this is the\nvalue¤'], ['34', '¤this is another\nvalue¤']]
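As a small follow-up sketch (an assumption about the desired post-processing, not part of the original answer), the header block can be dropped and the '¤' markers stripped before using the values:
rows = [
    [field.strip('¤') for field in block]
    for block in blocks[1:]  # blocks[0] is the header line
]
# rows == [['12', 'this is the\nvalue'], ['34', 'this is another\nvalue']]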

How to divide a csv file based on a condition?

I have this csv file:
89,Network activity,ip-dst,80.179.42.44,,1,20160929
89,Payload delivery,md5,4ad2924ced722ab65ff978f83a40448e,,1,20160929
89,Network activity,domain,alkamaihd.net,,1,20160929
90,Payload delivery,md5,197c018922237828683783654d3c632a,,1,20160929
90,Network activity,domain,dnsrecordsolver.tk,,1,20160929
90,Network activity,ip-dst,178.33.94.47,,1,20160929
90,Payload delivery,filename,Airline.xls,,1,20160929
91,Payload delivery,md5,23a9bbf8d64ae893db17777bedccdc05,,1,20160929
91,Payload delivery,md5,07e47f06c5ed05a062e674f8d11b01d8,,1,20160929
91,Payload delivery,md5,bd75af219f417413a4e0fae8cd89febd,,1,20160929
91,Payload delivery,md5,9f4023f2aefc8c4c261bfdd4bd911952,,1,20160929
91,Network activity,domain,mailsinfo.net,,1,20160929
91,Payload delivery,md5,1e4653631feebf507faeb9406664792f,,1,20160929
92,Payload delivery,md5,6fa869f17b703a1282b8f386d0d87bd4,,1,20160929
92,Payload delivery,md5,24befa319fd96dea587f82eb945f5d2a,,1,20160929
I need to divide this csv file into 4 csv files, where the condition is the event number at the beginning of every row. So far I have created a set that includes all the event numbers {89, 90, 91, 92}, and I know that I need a nested loop to copy each row to its dedicated csv file.
data = {
    '89': [],
    '90': [],
    '91': [],
    '92': []
}

with open('yourfile.csv') as infile:
    for line in infile:
        prefix = line[:2]
        data[prefix].append(line)

for prefix in data.keys():
    with open('csv' + prefix + '.csv', 'w') as outfile:
        outfile.writelines(data[prefix])
However, if you are open to solutions other than Python, then this can be easily accomplished by running four commands:
grep ^89 file.csv > 89.csv
grep ^90 file.csv > 90.csv
Similarly for other values.
It would be best not to hardcode the event numbers, so the code isn't dependent on the particular values in the data. I also prefer to use the csv module, which is designed for reading and writing .csv files.
Here's a way to do that (this answer targets Python 2, hence the 'rb'/'wb' file modes):
import csv

prefix = 'events'  # for output csv file names
data = {}

with open('conditions.csv', 'rb') as conditions:
    reader = csv.reader(conditions)
    for row in reader:
        data.setdefault(row[0], []).append(row)

for event in sorted(data):
    csv_filename = '{}_{}.csv'.format(prefix, event)
    print(csv_filename)
    with open(csv_filename, 'wb') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(data[event])
Update
The approach implemented above first reads the entire csv file into memory, and then writes all the rows associated with each event value into a separate output file, one event at a time.
A more memory-efficient approach is to open multiple output files simultaneously and write each row to the proper destination file immediately after it has been read. Doing this requires keeping track of which files are already open. The file-managing code also needs to make sure all the files are closed when processing is complete.
In the code below, all of this is accomplished by defining and using a Python context-manager class to centralize the handling of all the csv output files that might be created, depending on how many different event values there are in the input file.
import csv
import sys

PY3 = sys.version_info.major > 2

class MultiCSVOutputFileManager(object):
    """Context manager to open and close multiple csv files and csv writers."""
    def __enter__(self):
        self.files = {}
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        for file, csv_writer in self.files.values():
            print('closing file: {}'.format(file.name))
            file.close()
        self.files.clear()
        return None

    def get_csv_writer(self, filename):
        if filename not in self.files:  # new file?
            open_kwargs = dict(mode='w', newline='') if PY3 else dict(mode='wb')
            print('opening file: {}'.format(filename))
            file = open(filename, **open_kwargs)
            self.files[filename] = file, csv.writer(file)
        return self.files[filename][1]  # return the associated csv.writer object
And here's how to use it:
prefix = 'events'  # used to name each csv output file

with open('conditions.csv', 'rb') as conditions:
    reader = csv.reader(conditions)
    with MultiCSVOutputFileManager() as file_manager:
        for row in reader:
            csv_filename = '{}_{}.csv'.format(prefix, row[0])  # row[0] is the event
            writer = file_manager.get_csv_writer(csv_filename)
            writer.writerow(row)
You can even create the result files dynamically, whenever a new value appears in the first field, by keeping a mapping from that id to the associated file:
files = {}
with open('file.csv') as fd:
    for line in fd:
        if 0 == len(line.strip()):
            continue  # skip empty lines
        try:
            id_field = line.split(',', 1)[0]  # extract the first field
            if id_field not in files:  # if not yet encountered, open a new result file
                files[id_field] = open(id_field + '.csv', 'w')
            files[id_field].write(line)  # write the line to the proper file
        except Exception as e:
            print('ERR', line, e)  # catchall in case of problems...
for f in files.values():
    f.close()

How can I check if the first line of a CSV file has been written to?

So I'm running this Python script daily, but I want it to check if the header line is already written, write it if it isn't, and skip it if it is. I've tried doing things like reading the first line and setting a variable if there's input, but it hasn't worked. Here's my code:
def addDomainsToFile(domainList):
    date = time.strftime("%d:%m:%Y")
    fileName = 'MagDomains%s.csv' % date
    # Create file with the date as the name; this should be a week-to-week file, check if day is monday, if so,
    with open(fileName, 'ab+') as c:
        writer = csv.writer(c, dialect='excel', delimiter=',')
        for row in fileName:
            print row
        writer.writerow(['domain', 'ip', 'identifier', 'relationship', 'related To'])
        for item in domainList:
            writer.writerow([item, '', 'Related'])
How about checking if the file size of the csv is greater than zero?
Should be enough for a rudimentary check:
import os

if os.path.getsize(fileName) == 0:
    write_header()
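One caveat (an addition to that answer): os.path.getsize raises OSError if the file doesn't exist yet, so a slightly more defensive sketch would be:
import os

# Treat a missing file the same as an empty one.
if not os.path.isfile(fileName) or os.path.getsize(fileName) == 0:
    write_header()  # the header-writing step from the answer above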
You can read the first row of your csv using csv.reader and the next function, and compare with your first row:
with open(fileName, 'ab+') as c:
    writer = csv.writer(c, dialect='excel', delimiter=',')
    try:
        first_row = next(csv.reader(c, dialect='excel', delimiter=','))
        for item in domainList:
            writer.writerow([item, '', 'Related'])
    except StopIteration:
        writer.writerow(['domain', 'ip', 'identifier', 'relationship', 'related To'])
        for item in domainList:
            writer.writerow([item, '', 'Related'])
See if csv.Sniffer.has_header works for you.
https://docs.python.org/2/library/csv.html#csv.Sniffer
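For reference, a minimal sketch of the Sniffer approach (an illustration, assuming the same fileName and header-writing step as above):
import csv

with open(fileName, 'rb') as f:  # 'rb' to match the Python 2 code in this question
    sample = f.read(1024)

# Note: the Sniffer needs some sample text to inspect; an empty file raises csv.Error.
if not sample or not csv.Sniffer().has_header(sample):
    write_header()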
I needed to do this too and had to make some changes to Kasramvd's solution in order to make it work.
When you use 'a+' mode, the file pointer is at the end. So you have to jump to the beginning of the file to read the first line.
After reading the header (if there is one), you can jump back to the end to append to the file.
with open(filename, 'a+') as f:  # just use 'w' mode in 3.x
    logger.info("Opened file")
    f.seek(0)  # jump to the beginning of the file
    try:
        header = csv.reader(f).next()
        dict_writer = csv.DictWriter(f, header)  # header found
    except StopIteration:  # no header found
        dict_writer = csv.DictWriter(f, my_dict.keys())
        dict_writer.writeheader()
    f.seek(0, 2)  # jump back to the end of the file
    try:
        dict_writer.writerow(my_dict)
    except ValueError:
        pass  # some keys from my_dict are not in the header

How do I fix this so it changes all the text in a csv to lowercase?

I keep getting "NameError: name '[file name]' is not defined" after entering process_csv(b.csv, b2.csv) in IDLE. I'm not sure what to fix. Any ideas?
import csv

def process_csv(file_name, new_file_name):
    '''Reads from an external CSV file,
    puts each alphabetic value in lowercase,
    and writes the values to a new CSV file.
    '''
    file1 = open(file_name, 'r')
    file2 = open(new_file_name, 'w', newline='')
    data = csv.reader(file1)
    new_data = csv.writer(file2)
    for line in data:
        row = []
        for item in line:
            if item.isalpha():
                item = item.lower()
            row.append(item)
        new_data.writerow(row)
    file1.close()
    file2.close()
    print('File processed.')
You need quotation marks around the file names:
"b.csv" and "b2.csv"
