I am getting this weird UnicodeDecodeError and I don't know what is causing it.
It would be really nice if someone could help me out with this issue. :)
Error message:
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 6456:character maps to
Full Error message as an screenshot
screenshot of the Error message
My code:
import os
import json
import random
import csv
from pydub import AudioSegment
file_path = '/path/to/file/.tsv '
save_json_path = '/path/where/you/want/the/jsons/saved'
def main(args):
    """Create train/test JSON manifests from a Common Voice TSV file.

    Reads the tab-separated file at ``file_path`` (columns ``path`` and
    ``sentence``); when ``args.convert`` is set, also converts every
    referenced mp3 clip to wav with pydub.  Shuffled records are split
    99%/1% into train.json and test.json under ``save_json_path``.
    """
    data = []
    directory = file_path.rpartition('/')[0]
    percent = 100  # reserve 1/percent of the data for the test split

    # Count data rows up front so conversion progress can be reported.
    # encoding='utf-8' fixes the reported UnicodeDecodeError: the platform
    # default codec (e.g. cp1252 on Windows) chokes on the TSV's UTF-8 bytes.
    with open(file_path, encoding='utf-8') as f:
        length = sum(1 for _ in f)

    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile, delimiter='\t')
        if args.convert:
            print(str(length) + "files found")
        for index, row in enumerate(reader, start=1):
            file_name = row['path']
            filename = file_name.rpartition('.')[0] + ".wav"
            text = row['sentence']
            if args.convert:
                data.append({
                    "key": directory + "/clips/" + filename,
                    "text": text
                })
                print("converting file " + str(index) + "/" + str(length) + " to wav", end="\r")
                src = directory + "/clips/" + file_name
                dst = directory + "/clips/" + filename
                sound = AudioSegment.from_mp3(src)
                sound.export(dst, format="wav")
            else:
                data.append({
                    "key": directory + "/clips/" + file_name,
                    "text": text
                })

    random.shuffle(data)
    print("creating JSON's")

    d = len(data)
    split = int(d - d / percent)  # first 99% -> train, remainder -> test

    # NOTE: the original opened each output file twice ('f = open(...)'
    # immediately before the 'with'), leaking one handle per file.
    with open(save_json_path + "/" + 'train.json', 'w') as f:
        for record in data[:split]:
            f.write(json.dumps(record) + "\n")

    with open(save_json_path + "/" + 'test.json', 'w') as f:
        for record in data[split:]:
            f.write(json.dumps(record) + "\n")

    print("Done!")
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="""
    Utility script to convert commonvoice into wav and create the training and test json files for speechrecognition. """
    )
    # NOTE(review): with default=True plus store_true the flag is always on;
    # change default to False if skipping the conversion should be possible.
    parser.add_argument('--convert', default=True, action='store_true',
                        help='says that the script should convert mp3 to wav')
    # parse_known_args() returns a (namespace, unknown_args) tuple.  The
    # original passed the whole tuple to main(), which then crashed on
    # args.convert; unpack the namespace before handing it over.
    args, _ = parser.parse_known_args()
    main(args)
It looks like you're getting this error in this block
with open(file_path) as f:
length = sum(1 for line in f)
In another post, though it doesn't have an accepted answer, this is shown to likely be because of the encoding of your file.
Try adding the encoding kwarg to open
with open(file_path, encoding="latin-1") as f:
length = sum(1 for line in f)
Related
I have the following script that does multiple operations for .txt file (splits by tag, removes line breaks, sentence per line & blank lines in between).
I know how to do this for a specific file:
import re

# Split "input.txt" into output_1.txt, output_2.txt, ...
# A new output file starts at the first non-blank line after each closing
# "</div>" tag; every sentence goes on its own line with a blank line after.
with open("input.txt", "r") as f:
    i = 1
    w = None
    for line in f:
        line = line.strip()
        if not line:
            continue
        if w is not None and line == "</div>":
            w.close()
            i += 1
            w = None
        else:
            if w is None:
                w = open('output_%i.txt' % i, 'w')
            # Raw string: '\s' in a normal literal is a deprecated escape.
            for s in re.split(r"(?<=[.!:;?])\s+", line):
                w.write(s + '\n\n')
    # BUG FIX: close the last output file if the input didn't end in "</div>".
    if w is not None:
        w.close()
How can I apply this to all .txt files in a dir?
I cannot figure out how to open and write to all files in a dir.
I thought this would work:
import os
import re

# Apply the split-by-tag transformation to every file in `path`,
# writing numbered output files next to the inputs.
path = "/Users/simon/dic/en-new-oxford/output"
for filename in os.listdir(path):
    source = os.path.join(path, filename)
    # Sub-directories (or anything not a regular file) would crash open().
    if not os.path.isfile(source):
        continue
    with open(source, 'r') as f:
        i = 1
        w = None
        for line in f:
            line = line.strip()
            if not line:
                continue
            if w is not None and line == "</d:entry>":
                w.close()
                i += 1
                w = None
            else:
                if w is None:
                    # BUG FIX: the original passed the single literal
                    # 'path, filename_%i.txt' to os.path.join, so the
                    # output name was literally "path, filename_N.txt".
                    # Build it from the real path and input file name.
                    w = open(os.path.join(path, '%s_%i.txt' % (filename, i)), 'w')
                for s in re.split(r"(?<=[.!:;?])\s+", line):
                    w.write(s + '\n\n')
        # Close the final output if the file didn't end with </d:entry>.
        if w is not None:
            w.close()
What is wrong about this?
This does work. You probably had some other directory inside your directory messing up your code. The snippet below also checks that each file name ends with '.txt'.
from os import listdir
from os.path import isfile, join

path = ''
# Keep only regular files whose name ends in '.txt'; sub-directories
# (and anything else that is not a plain file) are filtered out.
allFiles = []
for entry in listdir(path):
    candidate = join(path, entry)
    if isfile(candidate) and entry.endswith('.txt'):
        allFiles.append(candidate)

for file in allFiles:
    with open(file) as f:
        "do stuff here"
At the beginning, I simply use the COM constant 0x14 to install the fonts, and it works when I test it.
But when I use it in my program, it doesn't work.
# Download the CaskaydiaCove / FiraCode Nerd Font archives and install them
# on Windows by generating a PowerShell script that copies each font file
# into the special Fonts shell folder.
links = ['https://p-f-t.com/cdn/CaskaydiaCove.zip', 'https://p-f-t.com/cdn/FiraCode.zip']

print('Downloading Nerd Font'.center(45,'=') + '\n')

def downFile(url):
    """Download *url* into the CWD and return the saved file name."""
    resp = requests.get(url)
    file_name = os.path.basename(url)
    # BUG FIX: use a context manager so the handle is closed even on error
    # (the original left `file` open if write() raised).
    with open(file_name, 'wb') as file:
        file.write(resp.content)
    print('Download', file_name, 'done!')
    return file_name

font_zip_list = []
for link in links:
    font_zip_list.append(downFile(link))

print('Installing Nerd Font'.center(45,'=') + '\n')

# 0x14 is the shell constant (CSIDL) for the Fonts folder; CopyHere on that
# namespace performs a real font installation rather than a plain file copy.
font_inst_script = '''$FONTS = 0x14
$objShell = New-Object -ComObject Shell.Application
$objFolder = $objShell.Namespace($FONTS)'''

def unzip_inst(zipfile_name):
    """Extract *zipfile_name* into the CWD; return the extracted file names."""
    font_list = []
    # BUG FIX: close the archive when done instead of leaking the handle.
    with zipfile.ZipFile(zipfile_name, 'r') as zip_file:
        for i in zip_file.namelist():
            zip_file.extract(i, os.getcwd())
            # Entries ending in '/' are directories, not font files.
            if i.rfind(r'/') != len(i) - 1:
                print('Extracted', i, 'from', zipfile_name)
                font_list.append(i)
    return font_list

font_list = []
for z in font_zip_list:
    font_list += unzip_inst(z)
print(font_list)

for f in font_list:
    font_inst_script += '\n$objFolder.CopyHere("{font_path}")'.format(font_path='./' + f)

# BUG FIX: flush and close the script before PowerShell reads it -- with the
# unclosed handle the script could still be empty when os.system() ran.
with open('install.ps1', 'w') as script:
    script.write(font_inst_script)
os.system('powershell.exe ./install.ps1')
print(''.center(45,'=') + '\n')
I need a simple way to install fonts families on python or powershell.
I know there's a lot of content about reading & writing out there, but I'm still not quite finding what I need specifically.
I have 5 files (i.e. in1.txt, in2.txt, in3.txt....), and I want to open/read, run the data through a function I have, and then output the new returned value to corresponding new files (i.e. out1.txt, out2.txt, out3.txt....)
I want to do this in one program run. I'm not sure how to write the loop to process all the numbered files in one run.
If you want them to be processed serially, you can use a for loop as follows:
inpPrefix = "in"
outPrefix = "out"
# Handle in1.txt .. in5.txt one after another, emitting out1.txt .. out5.txt.
for i in range(1, 6):
    src_name = inpPrefix + str(i) + ".txt"
    dst_name = outPrefix + str(i) + ".txt"
    # Read everything first so the input handle is closed before writing.
    with open(src_name, 'r') as f:
        fileLines = f.readlines()
    # Transform the file's content with the caller-supplied function.
    processedOutput = process(fileLines)
    with open(dst_name, 'w') as f:
        f.write(processedOutput)
Note: This assumes that the input and output files are in the same directory as the script is in.
If you are looking just for running one by one separately you can do:
import os

# Process every .txt file in `directory`, writing the result to a new file
# whose name is prefixed with a running counter.
count = 0
directory = "dir/where/your/files/are/"
for filename in os.listdir(directory):
    if filename.endswith(".txt"):
        count += 1
        with open(directory + filename, "r") as read_file:
            return_of_your_function = do_something_with_data()
        # BUG FIX: str(count) -- concatenating the bare int to the path
        # strings raised "TypeError: can only concatenate str (not int)".
        with open(directory + str(count) + filename, "w") as write_file:
            write_file.write(return_of_your_function)
Here, you go! I would do something like this:
(Assuming all the input .txt files are in the same input folder)
# Read inN.txt from the input folder, run it through process_input(), and
# write the result to outN.txt in the output folder, for N in 1..5.
input_path = '/path/to/input/folder/'
output_path = '/path/to/output/folder/'
for count in range(1,6):
    input_file = input_path + 'in' + str(count) + '.txt'
    output_file = output_path + 'out' + str(count) + '.txt'
    with open(input_file, 'r') as f:
        content = f.readlines()
    output = process_input(content)
    with open(output_file, 'w') as f:
        # BUG FIX: write via `f` -- the original called w.write(), but no
        # name `w` exists here, so every iteration raised NameError.
        f.write(output)
My program does everything I want, but it is not saving the final data to the csv file. I used a print before it to check whether the data was right, and it is; it is just not being written to the csv file. I'm using 'a' because I don't want it to overwrite what's already written, but it is still returning an error.
here's the part of the code:
# Scrape each fieldLabel/fieldData cell pair from every "formTable" row and
# append it to the comma-separated line built for the current case.
soup = BeautifulSoup(answer)
for table in soup.findAll('table', {"class":"formTable"}):
for row in table.findAll('tr'):
#heading = row.find('td', {"class":"sectionHeading"})
#if heading is not None:
#print(heading.get_text());
#else:
label = row.find('td', {"class":"fieldLabel"})
data = row.find('td', {"class":"fieldData"})
if data is not None and label is not None:
csvline += label.get_text() + "," + data.get_text() + ","
print(csvline)
#csvline.encode('utf-8')
# NOTE(review): on Python 2 the built-in open() has no 'encoding' keyword --
# this line is the source of the reported TypeError; use io.open() or
# codecs.open() there instead.
with open ('output_file_two.csv', 'a', encoding='utf-8') as f:
writer = csv.writer(f)
# NOTE(review): passing a plain string to writerow() treats it as a
# sequence of single characters, yielding one CSV column per character.
writer.writerow(csvline)
Here's the error:
Traceback (most recent call last):
File "C:\PROJECT\pdfs\final.py", line 95, in <module>
with open ('output_file_two.csv', 'a', encoding='utf-8') as f:
TypeError: 'encoding' is an invalid keyword argument for this function
Here's the entire program code in case of need
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
#import unicodecsv as csv
import csv
#import pickle
import requests
from robobrowser import RoboBrowser
import codecs
def rename_files():
# Strip every space from the file names in C:\PROJECT\pdfs, restoring the
# original working directory afterwards.
# NOTE(review): str.translate(None, " ") is the Python 2 API; on Python 3
# this call raises TypeError.
file_list = os.listdir(r"C:\\PROJECT\\pdfs")
print(file_list)
saved_path = os.getcwd()
print('Current working directory is '+saved_path)
os.chdir(r'C:\\PROJECT\\pdfs')
for file_name in file_list:
os.rename(file_name, file_name.translate(None, " "))
os.chdir(saved_path)
rename_files()
def run(command):
    """Execute *command* in a subprocess.

    Returns a ``(success, stdout, stderr)`` triple, where *success* is True
    when the process exited with status 0.
    """
    # Windows accepts the raw command string; elsewhere Popen needs an
    # argv list, so tokenise with shlex first.
    args = command if platform.system() == 'Windows' else shlex.split(command)
    proc = subprocess.Popen(args,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    output, errors = proc.communicate()
    return proc.returncode == 0, output, errors
# ---- Python 2 pipeline: convert PDFs to HTML, collect "PA/..." case
# ---- references, then query pa.org.mt per case and append the scraped
# ---- label/value pairs to a CSV.  (Indentation was lost in this paste;
# ---- the bare `print` statements are Python 2 syntax.)
# Change this to your PDF file base directory
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
print "%s is not a directory" % base_directory
exit(1)
# Change this to your pdf2htmlEX executable location
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
print "Could not find %s" % bin_path
exit(1)
# Walk the tree and run pdf2txt.py on every .pdf, producing <name>.html.
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
for file_name in file_name_list:
# If this is not a PDF file
if not file_name.endswith('.pdf'):
# Skip it
continue
file_path = os.path.join(dir_path, file_name)
# Convert your PDF to HTML here
args = (bin_path, file_name, file_path)
success, output, errors = run("python %s -o %s.html %s " %args)
if not success:
print "Could not convert %s to HTML" % file_path
print "%s" % errors
htmls_path = 'C:\\PROJECT'
# Collect "PA/<number>/<year>" references from each generated HTML file.
with open ('score.csv', 'w') as f:
writer = csv.writer(f)
for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
for file_name in file_name_list:
if not file_name.endswith('.html'):
continue
with open(file_name) as markup:
soup = BeautifulSoup(markup.read())
text = soup.get_text()
match = re.findall("PA/(\S*)", text)#To remove the names that appear, just remove the last (\S*), to add them is just add the (\S*), before it there was a \s*
print(match)
writer.writerow(match)
# Submit the search form once per case and scrape the results table.
for item in match:
data = item.split('/')
case_number = data[0]
case_year = data[1]
csvline = case_number + ","
browser = RoboBrowser()
browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
form = browser.get_forms()[0] # Get the first form on the page
form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])
# Use BeautifulSoup to parse this data
answer = browser.response.text
#print(answer)
soup = BeautifulSoup(answer)
for table in soup.findAll('table', {"class":"formTable"}):
for row in table.findAll('tr'):
#heading = row.find('td', {"class":"sectionHeading"})
#if heading is not None:
#print(heading.get_text());
#else:
label = row.find('td', {"class":"fieldLabel"})
data = row.find('td', {"class":"fieldData"})
if data is not None and label is not None:
csvline += label.get_text() + "," + data.get_text() + ","
print(csvline)
# NOTE(review): writerow() on a plain string emits one column per
# character; pass a list/tuple of fields instead.
with open ('output_file_two.csv', 'a') as f:
writer = csv.writer(f)
writer.writerow(csvline)
EDIT
It's working, here's the code working
import shlex
import subprocess
import os
import platform
from bs4 import BeautifulSoup
import re
import unicodecsv as csv
import requests
from robobrowser import RoboBrowser
import codecs
def rename_files():
# Strip every space from the file names in C:\PROJECT\pdfs, restoring the
# original working directory afterwards.
# NOTE(review): str.translate(None, " ") is the Python 2 API; on Python 3
# this call raises TypeError.
file_list = os.listdir(r"C:\\PROJECT\\pdfs")
print(file_list)
saved_path = os.getcwd()
print('Current working directory is '+saved_path)
os.chdir(r'C:\\PROJECT\\pdfs')
for file_name in file_list:
os.rename(file_name, file_name.translate(None, " "))
os.chdir(saved_path)
rename_files()
def run(command):
    """Execute *command* in a subprocess.

    Returns a ``(success, stdout, stderr)`` triple, where *success* is True
    when the process exited with status 0.
    """
    # Windows accepts the raw command string; elsewhere Popen needs an
    # argv list, so tokenise with shlex first.
    args = command if platform.system() == 'Windows' else shlex.split(command)
    proc = subprocess.Popen(args,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    output, errors = proc.communicate()
    return proc.returncode == 0, output, errors
# ---- "Working" Python 2 version of the pipeline above; the CSV append at
# ---- the end now uses codecs.open with an explicit utf-8 encoding, which
# ---- is what resolved the original TypeError.
base_directory = 'C:\\PROJECT\\pdfs'
if not os.path.isdir(base_directory):
print "%s is not a directory" % base_directory
exit(1)
bin_path = 'C:\\Python27\\pdfminer-20140328\\tools\\pdf2txt.py'
if not os.path.isfile(bin_path):
print "Could not find %s" % bin_path
exit(1)
# Convert every .pdf under base_directory to <name>.html via pdf2txt.py.
for dir_path, dir_name_list, file_name_list in os.walk(base_directory):
for file_name in file_name_list:
if not file_name.endswith('.pdf'):
continue
file_path = os.path.join(dir_path, file_name)
args = (bin_path, file_name, file_path)
success, output, errors = run("python %s -o %s.html %s " %args)
if not success:
print "Could not convert %s to HTML" % file_path
print "%s" % errors
htmls_path = 'C:\\PROJECT'
# Collect "PA/<number>/<year>" references from each generated HTML file.
with open ('score.csv', 'w') as f:
writer = csv.writer(f)
for dir_path, dir_name_list, file_name_list in os.walk(htmls_path):
for file_name in file_name_list:
if not file_name.endswith('.html'):
continue
with open(file_name) as markup:
soup = BeautifulSoup(markup.read())
text = soup.get_text()
match = re.findall("PA/(\S*)", text)
print(match)
writer.writerow(match)
# Submit the pa.org.mt search form once per case and scrape the table.
for item in match:
data = item.split('/')
case_number = data[0]
case_year = data[1]
csvline = case_number + ","
browser = RoboBrowser()
browser.open('http://www.pa.org.mt/page.aspx?n=63C70E73&CaseType=PA')
form = browser.get_forms()[0]
form['ctl00$PageContent$ContentControl$ctl00$txtCaseNo'].value = case_number
form['ctl00$PageContent$ContentControl$ctl00$txtCaseYear'].value = case_year
browser.submit_form(form, submit=form['ctl00$PageContent$ContentControl$ctl00$btnSubmit'])
answer = browser.response.text
soup = BeautifulSoup(answer)
for table in soup.findAll('table', {"class":"formTable"}):
for row in table.findAll('tr'):
label = row.find('td', {"class":"fieldLabel"})
data = row.find('td', {"class":"fieldData"})
if data is not None and label is not None:
csvline += label.get_text() + "," + data.get_text() + ","
print(csvline)
# codecs.open supplies the utf-8 encoding Python 2's open() lacks.
# NOTE(review): the handle is never closed and no newline is appended, so
# successive cases run together on one line.
my_file = codecs.open('final_output.csv', 'a', 'utf-8')
my_file.write(csvline)
At the end there is a problem with your code
writer = csv.writer(f)
csv.writer(csvline) # here is the problem
See you initialize the writer, but then you don't use it.
writer = csv.writer(f)
writer.writerow(csvline)
Here :
with open ('output_file_two.csv', 'a') as f:
writer = csv.writer(f)
csv.writer (csvline)
You are instantiating a csv.writer, but not using it. This should read:
# Append the assembled line via the csv writer.  BUG FIX: csv.writer objects
# have no write() method -- the row-writing method is writerow().
with open ('output_file_two.csv', 'a') as f:
    writer = csv.writer(f)
    writer.writerow(csvline)
Now there are quite a few other problems with your code, the first one being that you manually build `csvline` as text and then use csv.writer to store it to a file. csv.writer.writerows() expects a list of rows (tuples) and takes care of properly escaping what needs to be escaped, inserting the proper delimiters, etc. It also has a writerow() method that takes a single tuple and so avoids building the whole list in memory, FWIW.
I am trying to read all fasta files from the test folder and put the file name into every header of each individual file. The code works for the first file, but it doesn't proceed to the second file and returns an error. Could you help me find the bug in my code or edit it? Thanks.
import sys, glob, os, string
# Goal: for every fasta file in ./test/, write a copy whose '>' header
# lines have the source file name appended.
header = ''
check = 0
path = "./test/"
dirs = os.listdir(path)
for file in dirs:
# NOTE(review): this is the reported bug -- 'file' is a bare name, so
# open() looks in the current directory, not in ./test/; it must be
# os.path.join(path, file).  Neither handle is ever closed, either.
fp = open(file, "r")
fpx = open('%s_output.txt' % file, 'w')
for line in fp:
if line.startswith('>'):
line = line.rstrip()
check = check + 1
# check was just incremented from >= 0, so this test is always true.
if check >= 1:
header = line
fpx.write(header + '_' + file + '\n')
else:
line = line.rstrip()
fpx.write(line + '\n')
It would be good to provide the error message you are getting! I think this must fail with "File not found" because you try to open the file by name instead of path. Try fp = open(os.path.join(path, file), "r"):
import sys, glob, os, string

# For every fasta file in ./test/, write a copy whose '>' header lines have
# the source file name appended.
header = ''
check = 0
path = "./test/"
dirs = os.listdir(path)
for file in dirs:
    # Open via the full path -- a bare name only resolves from inside ./test/.
    # BUG FIX: context managers close both handles; the original leaked one
    # input and one output handle per file.
    with open(os.path.join(path, file), "r") as fp, \
         open('%s_output.txt' % file, 'w') as fpx:
        for line in fp:
            if line.startswith('>'):
                line = line.rstrip()
                check = check + 1
                # check was just incremented, so this is always true; kept
                # to preserve the original structure.
                if check >= 1:
                    header = line
                    fpx.write(header + '_' + file + '\n')
            else:
                line = line.rstrip()
                fpx.write(line + '\n')