I am working on a text file right now called "dracula.txt", and I have to do the following in Python:
Save words that occur no more than 3 times in descending order in a file called less_common_words.txt. Each word with its count should be saved on a separate line.
I would appreciate any help! I've been working on this for too long.
I have already tokenized my file and counted the words. This is my code so far:
file = open("C:/Users/17733/Downloads/dracula.txt", 'r', encoding = 'utf-8-sig')
data = file.read()
data
data_list = data.split('\n')
data_list
new_list = []
for i in data_list:
    if i != '':
        ans_here = i.split(' ')
        new_list.extend(ans_here)
new_list
import string
import re
puncs = list(string.punctuation)
puncs.append('"')
puncs.append('[')
puncs.append('.')
puncs.append('-')
puncs.append('_')
# append each separately
new_2 = []
for i in new_list:
    for p in puncs:
        if p in i:
            i_new = i.replace(p, ' ')
            new_2.append(i_new)
new_2
new_2 = [i.replace(' ', ' ').strip().lower() for i in new_2]
new_2
from pathlib import Path
from collections import Counter
import string
filepath = Path('test.txt')
output_filepath = Path('outfile.txt')
# print(filepath.exists())
with open(filepath) as f:
    content = f.readlines()
word_list = sum((
    (s.lower().strip('\n').translate(str.maketrans('', '', string.punctuation))).split(' ')
    for s in content
), [])
less_common_words = sorted([
    key for key, value in Counter(word_list).items() if value <= 3
], reverse=True)
with open(output_filepath, mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(less_common_words))
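Since the original brief asks for each word together with its count, a minimal sketch of that final write step (assuming word_list is built as above and reading "descending" as reverse alphabetical) might be:
counts = Counter(word_list)
# keep only words seen at most 3 times, in descending (reverse alphabetical) order
less_common_words = sorted((w for w in counts if counts[w] <= 3), reverse=True)
with open(output_filepath, mode='wt', encoding='utf-8') as myfile:
    myfile.write('\n'.join(f'{w} {counts[w]}' for w in less_common_words))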
This should be exactly what you need. I fixed my previous error by splitting the text into a 2D list of lines and then flattening it:
book_open = open('frankenstein.txt', 'r').readlines()
beauty_book = [i.split() for i in book_open]
flatten = []
for sublist in beauty_book:
    for val in sublist:
        flatten.append(val)
foo = 0
for i in flatten:
    # note: this re-reads less_common_words.txt on every word, which is slow but works
    list_open = open('less_common_words.txt', 'r').readlines()
    beauty_list = [i.replace('\n', '') for i in list_open]
    count = flatten.count(flatten[foo])
    compile = str((flatten[foo], count))
    if count <= 3:
        if compile not in beauty_list:
            file = open('less_common_words.txt', 'a+')
            file.write('\n' + compile)
            file.close()
    foo += 1
So basically I want to iterate over the lines of a text file that has this format:
-----------------------------------------
Code: 0123456789
EGGS: 3 7.00 21.00
BACON: 1 3.50 3.50
COFFEE: 2 14.20 28.40
TOTAL: 52.90
-----------------------------------------
and I have the following code to read the lines one by one:
with open(filename, "rt", encoding="utf-8") as f:
    for line in f:
        prevline = line
        line.split()
        if '-' in line:
            temp = f.readline().split(':')  # Get Code
            print(temp)
            AFM = temp[1]
            print(AFM)
        else:
            tempProducts = line.split(':')  # Get Product in a list
            productName = tempProducts[0]  # Store Product Name in a variable
            productStats = tempProducts[1]  # Store Product Stats in a list
            productStats = productStats.split(" ")
            for value in productStats:
                valueArray.append(float(value))
            products.update({productName: valueArray})
            if '-' in f.readline():
                rec = Receipt(AFM, products)
                products = {}
                valueArray = []
                receipts.append(rec)
            else:
                line = prevline
Mind that I want to skip the line with the '------------' characters. The code works, but it keeps reading the second line, then the fourth, then the sixth (Code, BACON, TOTAL). The question is how I can fix this. Edit: there are multiple receipts in the file, so each time I need to skip the line with the '----------'.
with open(filename, "rt", encoding="utf-8") as f:
    old_list = []  # Saving all the lines including '----'
    for line in f:
        old_list.append(line)
    new_list = old_list[1:-1]  # new list which removes the '----' lines
You can iterate just through new_list with your .split logic.
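For example, a rough sketch of that follow-up loop, reusing the split-on-':' logic from the question (purely illustrative):
for line in new_list:
    name, stats = line.split(':')
    print(name, stats.split())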
See if this does the job
with open(filename, "rt", encoding="utf-8") as f:
    valueArray = []
    for line in f:
        if not '-' in line:
            if 'Code' in line:
                AFM = line.split(':')[1]
                print(AFM)
                valueArray = []
                products = {}
            else:
                tempProducts = line.split(':')  # Get Product in a list
                productName = tempProducts[0]  # Store Product Name in a variable
                productStats = tempProducts[1]  # Store Product Stats in a list
                productStats_list = productStats.split()  # split() also drops the space left after the ':'
                valueArray = []  # fresh list for each product line
                for value in productStats_list:
                    valueArray.append(float(value))
                products.update({productName: valueArray})
                if 'TOTAL' in line:
                    rec = Receipt(AFM, products)
                    receipts.append(rec)
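This assumes a Receipt class and a receipts list already defined elsewhere; a minimal stand-in for testing could look like:
class Receipt:
    def __init__(self, afm, products):
        self.afm = afm            # the code read from the 'Code:' line
        self.products = products  # dict mapping product name to its list of numbers

receipts = []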
To anyone seeing this post now, consider it closed. I did not provide enough information and the code was messed up. Sorry for wasting your time.
When reading a CSV into a list and then trying to write back directly to that same CSV with some modifications, I am finding that the script skips to the next file prematurely, at the exact same point every time.
Changing the output file to a secondary file (i.e. filea.csv is read and the output is written to fileaCorrected.csv) works.
import time
import datetime
import csv
import sys
import glob
import pdb
pattern1 = '%m/%d/%Y %H:%M'
pattern2 = '%m/%d/%Y %H:%M:%S'
pattern3 = '%Y-%m-%d %H:%M:%S'
folderLocation = input("which Folder should be scanned and trimmed EX C:\program files\data:")
endDate = input("what is the last timestamp that should be in the file(s): ")
trimDate = int(time.mktime(time.strptime(endDate, pattern1)))
fileList = sorted(glob.glob(folderLocation+'/*.csv'))
for FileName in fileList:
    removedLines = 0
    FilesComplete = 0
    f = open(FileName)
    csv_f = csv.reader(f)
    #pdb.set_trace()
    header = next(f)
    newFileName = (FileName[:-4]) + " endateremoved.csv"
    with open(FileName, 'w') as csvfile:
        filewriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        csvfile.write(header)
        for row in csv_f:
            #(FileName," -- ",row[0])
            date_time = row[0]
            epoch = int(time.mktime(time.strptime(date_time, pattern3)))
            if epoch < trimDate:
                lineWriter = ""
                for item in row[:-1]:
                    lineWriter += item + ","
                lineWriter += row[-1]
                #print(lineWriter)
                csvfile.write(lineWriter + "\n")
            else:
                #print("removed line %s" % row)
                removedLines += 1
                break
    FilesComplete += 1
    print(str(FilesComplete) + " Files Completed")
print("%d files had removed lines" % removedLines)
I feel as though I am making a minor mistake in the script that is causing the file to end prematurely.
example
input
car,a
boat,b
plane,c
output
car,a
boat,b
pla
I could create a workaround that deletes the old files and renames the new ones, but that seems janky. Any thoughts on this are appreciated.
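For what it's worth, a common way around this is to never write to the file you are still reading from: write the kept rows to a temporary file and swap it in with os.replace afterwards. A minimal sketch under that assumption (keep_row is a hypothetical predicate wrapping the epoch < trimDate check from the script above):
import csv
import os
import tempfile

def trim_csv(path, keep_row):
    # write the rows we keep to a temp file in the same directory, then swap it in
    dir_name = os.path.dirname(path) or '.'
    with open(path, newline='') as src, \
         tempfile.NamedTemporaryFile('w', dir=dir_name, delete=False, newline='') as tmp:
        reader = csv.reader(src)
        writer = csv.writer(tmp)
        writer.writerow(next(reader))                        # copy the header row
        writer.writerows(row for row in reader if keep_row(row))
        tmp_name = tmp.name
    os.replace(tmp_name, path)                               # replace the original only after a full, successful write
The original file is only replaced once the new one has been written in full, so a crash mid-way cannot truncate it, and no separate "Corrected" copy is left behind.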
I have a CSV file of about 5000 rows in Python, and I want to split it into five files.
I wrote code for it, but it is not working:
import codecs
import csv
NO_OF_LINES_PER_FILE = 1000
def again(count_file_header, count):
    f3 = open('write_' + count_file_header + '.csv', 'at')
    with open('import_1458922827.csv', 'rb') as csvfile:
        candidate_info_reader = csv.reader(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
        co = 0
        for row in candidate_info_reader:
            co = co + 1
            count = count + 1
            if count <= count:
                pass
            elif count >= NO_OF_LINES_PER_FILE:
                count_file_header = count + NO_OF_LINES_PER_FILE
                again(count_file_header, count)
            else:
                writer = csv.writer(f3, delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL)
                writer.writerow(row)
def read_write():
    f3 = open('write_' + NO_OF_LINES_PER_FILE + '.csv', 'at')
    with open('import_1458922827.csv', 'rb') as csvfile:
        candidate_info_reader = csv.reader(csvfile, delimiter=',', quoting=csv.QUOTE_ALL)
        count = 0
        for row in candidate_info_reader:
            count = count + 1
            if count >= NO_OF_LINES_PER_FILE:
                count_file_header = count + NO_OF_LINES_PER_FILE
                again(count_file_header, count)
            else:
                writer = csv.writer(f3, delimiter=',', lineterminator='\n', quoting=csv.QUOTE_ALL)
                writer.writerow(row)
read_write()
The above code creates many files with empty content.
How can I split one file into five CSV files in Python?
Use readlines() and writelines() to do that; here is an example:
>>> csvfile = open('import_1458922827.csv', 'r').readlines()
>>> filename = 1
>>> for i in range(len(csvfile)):
... if i % 1000 == 0:
... open(str(filename) + '.csv', 'w+').writelines(csvfile[i:i+1000])
... filename += 1
the output file names will be numbered 1.csv, 2.csv, ... etc.
From terminal
FYI, you can do this from the command line using split as follows:
$ split -l 1000 import_1458922827.csv
I suggest you not reinvent the wheel. There is an existing solution. Source here
import os
def split(filehandler, delimiter=',', row_limit=1000,
          output_name_template='output_%s.csv', output_path='.', keep_headers=True):
    import csv
    reader = csv.reader(filehandler, delimiter=delimiter)
    current_piece = 1
    current_out_path = os.path.join(
        output_path,
        output_name_template % current_piece
    )
    current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
    current_limit = row_limit
    if keep_headers:
        headers = reader.next()
        current_out_writer.writerow(headers)
    for i, row in enumerate(reader):
        if i + 1 > current_limit:
            current_piece += 1
            current_limit = row_limit * current_piece
            current_out_path = os.path.join(
                output_path,
                output_name_template % current_piece
            )
            current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=delimiter)
            if keep_headers:
                current_out_writer.writerow(headers)
        current_out_writer.writerow(row)
Use it like:
split(open('/your/pat/input.csv', 'r'));
A Python 3-friendly solution:
import csv
import os

def split_csv(source_filepath, dest_folder, split_file_prefix,
              records_per_file):
    """
    Split a source csv into multiple csvs of equal numbers of records,
    except the last file.
    Includes the initial header row in each split file.
    Split files follow a zero-index sequential naming convention like so:
        `{split_file_prefix}_0.csv`
    """
    if records_per_file <= 0:
        raise Exception('records_per_file must be > 0')
    with open(source_filepath, 'r') as source:
        reader = csv.reader(source)
        headers = next(reader)
        file_idx = 0
        records_exist = True
        while records_exist:
            i = 0
            target_filename = f'{split_file_prefix}_{file_idx}.csv'
            target_filepath = os.path.join(dest_folder, target_filename)
            with open(target_filepath, 'w') as target:
                writer = csv.writer(target)
                while i < records_per_file:
                    if i == 0:
                        writer.writerow(headers)
                    try:
                        writer.writerow(next(reader))
                        i += 1
                    except StopIteration:
                        records_exist = False
                        break
            if i == 0:
                # we only wrote the header, so delete that file
                os.remove(target_filepath)
            file_idx += 1
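A hypothetical call, assuming the destination folder already exists (the names are just examples):
split_csv('input.csv', 'out_parts', 'part', 1000)  # writes out_parts/part_0.csv, out_parts/part_1.csv, ...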
I have modified the accepted answer a little bit to make it simpler.
Edited: Added the import statement and modified the print statement for printing the exception. @Alex F's code snippet was written for Python 2; for Python 3 you also need to use header_row = rows.__next__() instead of header_row = rows.next(). Thanks for pointing that out.
import os
import csv
def split_csv_into_chunks(file_location, out_dir, file_size=2):
    count = 0
    current_piece = 1
    # file_to_split_name.csv
    file_name = file_location.split("/")[-1].split(".")[0]
    split_file_name_template = file_name + "__%s.csv"
    splited_files_path = []
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    try:
        with open(file_location, "rb") as csv_file:
            rows = csv.reader(csv_file, delimiter=",")
            headers_row = rows.next()
            for row in rows:
                if count % file_size == 0:
                    current_out_path = os.path.join(out_dir,
                                                    split_file_name_template % str(current_piece))
                    current_out_writer = None
                    current_out_writer = csv.writer(open(current_out_path, 'w'), delimiter=",")
                    current_out_writer.writerow(headers_row)
                    splited_files_path.append(current_out_path)
                    current_piece += 1
                current_out_writer.writerow(row)
                count += 1
        return True, splited_files_path
    except Exception as e:
        print("Exception occurred as {}".format(e))
        return False, splited_files_path
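A hypothetical call (this snippet is Python 2, per the edit above; the paths are only examples):
ok, parts = split_csv_into_chunks('data/big_export.csv', 'data/chunks', file_size=1000)
print(parts)  # the paths of the chunk files that were written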
@Ryan, the Python 3 code worked for me. I used newline='' as below to avoid the blank-line issue:
with open(target_filepath, 'w', newline='') as target:
Another pandas solution (each 1000 rows), similar to Aziz Alto's solution:
suffix = 1
for i in range(len(df)):
    if i % 1000 == 0:
        df[i:i+1000].to_csv(f"processed/{filename}_{suffix}.csv", sep='|', index=False, index_label=False)
        suffix += 1
where df is the CSV loaded as a pandas.DataFrame, filename is the original filename, the pipe is the separator, and index=False / index_label=False skip the auto-incremented index column.
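A minimal setup for that snippet might look like this (the filename and separator are only examples):
import pandas as pd

filename = 'my_export'                         # base name reused in the output file pattern
df = pd.read_csv(f'{filename}.csv', sep='|')   # assumes the source file also uses '|' as its separator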
A simple Python 3 solution with Pandas that doesn't cut off the last batch
def to_csv_batch(src_csv, dst_dir, size=30000, index=False):
    import pandas as pd
    import math
    # Read source csv
    df = pd.read_csv(src_csv)
    # Initial values
    low = 0
    high = size
    # Loop through batches
    for i in range(math.ceil(len(df) / size)):
        fname = dst_dir + '/Batch_' + str(i+1) + '.csv'
        df[low:high].to_csv(fname, index=index)
        # Update selection
        low = high
        if (high + size < len(df)):
            high = high + size
        else:
            high = len(df)
Usage example
to_csv_batch('Batch_All.csv', 'Batches')
if count <= count:
    pass
This condition is always true, so you pass every time.
Otherwise you can look at this post: Splitting a CSV file into equal parts?
I suggest you leverage the possibilities offered by pandas. Here are functions you could use to do that:
import math
import logging
import pandas as pd

def csv_count_rows(file):
    """
    Counts the number of rows in a file.
    :param file: path to the file.
    :return: number of lines in the designated file.
    """
    with open(file) as f:
        nb_lines = sum(1 for line in f)
    return nb_lines

def split_csv(file, sep=",", output_path=".", nrows=None, chunksize=None, low_memory=True, usecols=None):
    """
    Split a csv into several files.
    :param file: path to the original csv.
    :param sep: View pandas.read_csv doc.
    :param output_path: path in which to output the resulting parts of the splitting.
    :param nrows: Number of rows to split the original csv by, also view pandas.read_csv doc.
    :param chunksize: View pandas.read_csv doc.
    :param low_memory: View pandas.read_csv doc.
    :param usecols: View pandas.read_csv doc.
    """
    nb_of_rows = csv_count_rows(file)
    # Parsing file elements: path, name, extension, etc.
    # file_path = "/".join(file.split("/")[0:-1])
    file_name = file.split("/")[-1]
    # file_ext = file_name.split(".")[-1]
    file_name_trunk = file_name.split(".")[0]
    split_files_name_trunk = file_name_trunk + "_part_"
    # Number of chunks to partition the original file into
    nb_of_chunks = math.ceil(nb_of_rows / nrows)
    if nrows:
        log_debug_process_start = f"The file '{file_name}' contains {nb_of_rows} ROWS. " \
            f"\nIt will be split into {nb_of_chunks} chunks of a max number of rows: {nrows}." \
            f"\nThe resulting files will be output in '{output_path}' as '{split_files_name_trunk}0 to {nb_of_chunks - 1}'"
        logging.debug(log_debug_process_start)
    for i in range(nb_of_chunks):
        # Number of rows to skip is determined by (the number of the chunk being processed) multiplied by (the nrows parameter).
        rows_to_skip = range(1, i * nrows) if i else None
        output_file = f"{output_path}/{split_files_name_trunk}{i}.csv"
        log_debug_chunk_processing = f"Processing chunk {i} of the file '{file_name}'"
        logging.debug(log_debug_chunk_processing)
        # Fetching the original csv file and handling it with skiprows and nrows to process its data
        df_chunk = pd.read_csv(filepath_or_buffer=file, sep=sep, nrows=nrows, skiprows=rows_to_skip,
                               chunksize=chunksize, low_memory=low_memory, usecols=usecols)
        df_chunk.to_csv(path_or_buf=output_file, sep=sep)
        log_info_file_output = f"Chunk {i} of file '{file_name}' created in '{output_file}'"
        logging.info(log_info_file_output)
And then in your main or Jupyter notebook you put:
# This is how you initiate logging in the most basic way.
logging.basicConfig(level=logging.DEBUG)
file = {#Path to your file}
split_csv(file,sep=";" ,output_path={#Path where you'd like to output it},nrows = 4000000, low_memory = False)
P.S.1: I put nrows = 4000000 because it's a personal preference. You can change that number if you wish.
P.S.2: I used the logging library to display messages. When you apply such a function to big files that sit on a remote server, you really want to avoid 'simple printing' and incorporate logging capabilities. You can replace logging.info or logging.debug with print.
P.S.3: Of course, you need to replace the {# Blablabla} parts of the code with your own parameters.
A simpler script works for me.
import pandas as pd
path = "path to file" # path to file
df = pd.read_csv(path) # reading file
low = 0  # Initial Lower Limit
high = 1000  # Initial Higher Limit
while high < len(df):
    df_new = df[low:high]  # subsetting DataFrame based on index
    low = high  # changing lower limit
    high = high + 1000  # giving upper limit with increment of 1000
    df_new.to_csv("Path to output file")  # output file; use a different name per chunk or each write overwrites the last
import pandas as pd
df = pd.read_csv('input.csv')
file_len = len(df)
filename = 'output'
n = 1
for i in range(file_len):
    if i % 10 == 0:
        sf = (df[i:i+10])
        sf.to_csv(f'{filename}_{n}.csv', index=False)
        n += 1
Building upon the top voted answer, here is a python solution that also includes the headers in each file.
file = open('file.csv', 'r')
header = file.readline()
csvfile = file.readlines()
filename = 1
batch_size = 1000
for i in range(len(csvfile)):
    if i % batch_size == 0:
        open(str(filename) + '.csv', 'w+').writelines(header)
        open(str(filename) + '.csv', 'a+').writelines(csvfile[i:i+batch_size])
        filename += 1
This will output the same file names as 1.csv, 2.csv, ... etc.
The following is a very simple solution that does not loop over all rows, but only over the chunks, which matters if you have millions of rows.
chunk_size = 100_000
for i in range(len(df) // chunk_size + 1):
    df[i*chunk_size:(i+1)*chunk_size].to_csv(f"output_{i:02d}.csv",
                                              sep=";", index=False)
You define the chunk size, and if the total number of rows is not an integer multiple of the chunk size, the last chunk will contain the rest.
With f"output_{i:02d}.csv", the suffix is formatted with two digits and a leading zero.
If you want a header only for the first chunk (and no header for the other chunks), you can pass a boolean based on the chunk index, header=(i == 0), that is:
for i in range(len(df) // chunk_size + 1):
    df[i*chunk_size:(i+1)*chunk_size].to_csv(f"output_{i:02d}.csv",
                                              sep=";", index=False, header=(i == 0))
Here is my code for reading individual cells of one CSV file, but I want to read multiple CSV files one by one from a .txt file where the CSV file paths are stored.
import csv
ifile = open ("C:\Users\BKA4ABT\Desktop\Test_Specification\RDBI.csv", "rb")
data = list(csv.reader(ifile, delimiter = ';'))
REQ = []
RES = []
n = len(data)
for i in range(n):
    x = data[i][1]
    y = data[i][2]
    REQ.append(x)
    RES.append(y)
    i += 1
for j in range(2, n):
    try:
        if REQ[j] != '' and RES[j] != '':  # ignore blank cell
            print REQ[j], ' ', RES[j]
    except:
        pass
    j += 1
And csv file paths are stored in a .txt file like
C:\Desktop\Test_Specification\RDBI.csv
C:\Desktop\Test_Specification\ECUreset.csv
C:\Desktop\Test_Specification\RDTC.csv
and so on..
You can read stuff stored in files into variables. And you can use variables with strings in them anywhere you can use a literal string. So...
with open('mytxtfile.txt', 'r') as txt_file:
    for line in txt_file:
        file_name = line.strip()  # or was it trim()? I keep mixing them up
        ifile = open(file_name, 'rb')
        # ... the rest of your code goes here
Maybe we can fix this up a little...
import csv
with open('mytxtfile.txt', 'r') as txt_file:
    for line in txt_file:
        file_name = line.strip()
        csv_file = csv.reader(open(file_name, 'rb'), delimiter=';')
        next(csv_file)  # skip header row
        for record in csv_file:
            req = record[1]
            res = record[2]
            if len(req + res):
                print req, ' ', res
You just need to add a loop that reads your file containing the list of file paths around your first open statement, for example:
from __future__ import with_statement
with open("myfile_which_contains_file_path.txt") as f:
for line in f:
ifile = open(line, 'rb')
# here the rest of your code
You need to use a raw string since your path contains backslashes:
import csv
file_list = r"C:\Users\BKA4ABT\Desktop\Test_Specification\RDBI.csv"
with open(file_list) as f:
    for line in f:
        with open(line.strip(), 'rb') as the_file:
            reader = csv.reader(the_file, delimiter=';')
            for row in reader:
                req, res = row[1:3]
                if req and res:
                    print('{0} {1}'.format(req, res))