I have a code in python to index a text file that contain arabic words. I tested the code on an english text and it works well ,but it gives me an error when i tested an arabic one.
Note: the text file is saved in unicode encoding not in ANSI encoding.
This is my code:
from whoosh import fields, index
import os.path
import csv
import codecs
from whoosh.qparser import QueryParser

# This list associates a name with each position in a row.
columns = ["juza", "chapter", "verse", "voc"]

schema = fields.Schema(juza=fields.NUMERIC,
                       chapter=fields.NUMERIC,
                       verse=fields.NUMERIC,
                       voc=fields.TEXT)

# Create the Whoosh index directory if it does not already exist.
indexname = "indexdir"
if not os.path.exists(indexname):
    os.mkdir(indexname)
ix = index.create_in(indexname, schema)

# Open a writer for the index.
with ix.writer() as writer:
    # Fix: the file is saved as Unicode, not ANSI, but was opened without
    # an explicit encoding, so it was decoded with the platform default and
    # the Arabic text was mangled.  'utf-8-sig' also strips a UTF-8 BOM if
    # present; if the file was saved by Windows Notepad as "Unicode", use
    # encoding='utf-16' instead -- TODO confirm the actual encoding.
    with codecs.open("h.txt", "r", encoding="utf-8-sig") as txtfile:
        for line in txtfile:
            thisline = line.split()
            # Fix: zip() pairs each field name with its value and ignores
            # any surplus tokens, so a malformed row can no longer raise
            # "IndexError: list index out of range" on columns[u].
            doc = dict(zip(columns, thisline))
            # Pass the dictionary to the add_document method.
            writer.add_document(**doc)

with ix.searcher() as searcher:
    query = QueryParser("voc", ix.schema).parse(u"بسم")
    results = searcher.search(query)
    print(len(results))
    # Fix: results are 0-based; results[1] is the SECOND hit and raises
    # IndexError when only one document matches.  Print the first hit,
    # and only when there is one.
    if results:
        print(results[0])
Then the error is :
Traceback (most recent call last):
File "C:\Python27\yarab.py", line 38, in <module>
fieldname = columns[u]
IndexError: list index out of range
this is a sample of the file:
1 1 1 كتاب
1 1 2 قرأ
1 1 3 لعب
1 1 4 كتاب
While I cannot see anything obviously wrong with that, I would make sure you're designing for error. Make sure you catch any situation where split() returns more than the expected number of elements, and handle it promptly (e.g. print and terminate). It looks like you might be dealing with ill-formatted data.
You missed the encoding declaration in your script. The first line should be:
# -*- coding: utf-8 -*-
Also to open a file with the unicode encoding use:
import codecs
with codecs.open("s.txt",encoding='utf-8') as txtfile:
Related
Hi everyone. I need help opening and reading the file.
Got this txt file - https://yadi.sk/i/1TH7_SYfLss0JQ
It is a dictionary
{"id0":"url0", "id1":"url1", ..., "idn":"urln"}
But it was written using json into txt file.
#This is how I dump the data into a txt
json.dump(after,open(os.path.join(os.getcwd(), 'before_log.txt'), 'a'))
So, the file structure is
{"id0":"url0", "id1":"url1", ..., "idn":"urln"}{"id2":"url2", "id3":"url3", ..., "id4":"url4"}{"id5":"url5", "id6":"url6", ..., "id7":"url7"}
And it is all a string....
I need to open it and check repeated ID, delete and save it again.
But getting - json.loads shows ValueError: Extra data
Tried these:
How to read line-delimited JSON from large file (line by line)
Python json.loads shows ValueError: Extra data
json.decoder.JSONDecodeError: Extra data: line 2 column 1 (char 190)
But still getting that error, just in different place.
Right now I got as far as:
# Read the whole log of back-to-back JSON objects and cut it into pieces.
with open('111111111.txt', 'r') as log:
    before_log = log.read()

# '}{' marks the boundary between two dumped objects; turn every boundary
# into the ', ' separator, then split the whole string on that separator.
# NOTE(review): ', ' also separates the key/value pairs inside each object,
# so this splits inside objects too -- same behaviour as the original.
before_log = before_log.replace('}{', ', ').split(', ')

# Copy the fragments into the working list (equivalent to the original
# append loop).
mu_dic = [fragment for fragment in before_log]
This eliminate the problem of several {}{}{} dictionaries/jsons in a row.
Maybe there is a better way to do this?
P.S. This is how the file is made:
json.dump(after,open(os.path.join(os.getcwd(), 'before_log.txt'), 'a'))
Your file size is 9.5 MB, so it'll take you a while to open it and debug it manually.
So, using head and tail tools (found normally in any Gnu/Linux distribution) you'll see that:
# You can use Python as well to read chunks from your file
# and see the nature of it and what it's causing a decode problem
# but i prefer head & tail because they're ready to be used :-D
$> head -c 217 111111111.txt
{"1933252590737725178": "https://instagram.fiev2-1.fna.fbcdn.net/vp/094927bbfd432db6101521c180221485/5CC0EBDD/t51.2885-15/e35/46950935_320097112159700_7380137222718265154_n.jpg?_nc_ht=instagram.fiev2-1.fna.fbcdn.net",
$> tail -c 219 111111111.txt
, "1752899319051523723": "https://instagram.fiev2-1.fna.fbcdn.net/vp/a3f28e0a82a8772c6c64d4b0f264496a/5CCB7236/t51.2885-15/e35/30084016_2051123655168027_7324093741436764160_n.jpg?_nc_ht=instagram.fiev2-1.fna.fbcdn.net"}
$> head -c 294879 111111111.txt | tail -c 12
net"}{"19332
So the first guess is that your file is a malformed series of JSON data, and the best guess is to separate }{ by a \n for further manipulation.
So, here is an example of how you can solve your problem using Python:
import json

input_file = '111111111.txt'
output_file = 'new_file.txt'

# Read the malformed file: a series of JSON objects glued together
# back-to-back ("{...}{...}{...}"), and put each object on its own line.
data = ''
with open(input_file, mode='r', encoding='utf8') as f_file:
    # this with statement part can be replaced by
    # using sed under your OS like this example:
    # sed -i 's/}{/}\n{/g' 111111111.txt
    data = f_file.read()
    data = data.replace('}{', '}\n{')

seen, total_keys, to_write = set(), 0, {}

# split the lines of the in memory data
for elm in data.split('\n'):
    # Robustness fix: a trailing newline or accidental blank line would
    # make json.loads('') raise a JSONDecodeError, so skip empty lines.
    if not elm.strip():
        continue
    # convert the line to a valid Python dict
    converted = json.loads(elm)
    # loop over the keys
    for key, value in converted.items():
        total_keys += 1
        # if the key is not seen then add it for further manipulations
        # else ignore it
        if key not in seen:
            seen.add(key)
            to_write.update({key: value})

# write the dict's keys & values into a new file as a JSON format
with open(output_file, mode='a+', encoding='utf8') as out_file:
    out_file.write(json.dumps(to_write) + '\n')

print(
    'found duplicated key(s): {seen} from {total}'.format(
        seen=total_keys - len(seen),
        total=total_keys
    )
)
Output:
found duplicated key(s): 43836 from 45367
And finally, the output file will be a valid JSON file and the duplicated keys will be removed with their values.
The basic difference between the file structure and actual json format is the missing commas and the lines are not enclosed within [. So the same can be achieved with the below code snippet
# Repair a file of back-to-back JSON objects into one parseable JSON array.
with open('json_file.txt') as f:
    # Read complete file.
    a = f.read()

# Collapse the content onto one line, terminate every object with a comma,
# then wrap the whole thing in brackets while dropping the comma that the
# replace step appended after the final object.
b = '[' + ''.join(a.splitlines()).replace("}", "},")[:-1] + ']'
x = json.loads(b)
I am reading in a CSV file in Python that looks like this:
REGION,1910,1920,1930,1940,1950,1960,1970,1980,1990,2000,2010
Alabama,2138093,2348174,2646248,2832961,3061743,3266740,3444165,3893888,4040587,4447100,4779736
Alaska,64356,55036,59278,72524,128643,226167,300382,401851,550043,626932,710231
My problem is that when i read the first line it reads it as
REGION,1910,1920,1930,1940,1950,1960,1970,1980,1990,2000,2010
which in first place doesn't seem as much as a problem.
But later on I look for a number so a split the string into a list
lijst_eerste_regel = self.eerste_regel.split(",")
and then look for the index of str(2010), but Python then seems to look for '2010', not "2010". Therefore it won't find the index.
I post the code right here(it is in a class I am having this problem, not sure if that is relevant or not)
import io

# Demonstration of the reported bug: the header row read from the stream
# is never stripped, so its last element still carries the trailing
# newline (e.g. '2010\n'), and index('2010') fails -- see the ValueError
# reproduced at the bottom.
class Volkstelling:
    def __init__(self,jaartal,csvb):
        """
        >>> vs2010 = Volkstelling(2010, 'vs_bevolkingsaantal.csv')
        """
        import csv
        # jaartal: census year to look up; csvb: line-iterable CSV source.
        self.jaartal = jaartal
        self.csvb = csvb
        # First line of the source = header row (still ends with '\n').
        self.eerste_regel = next(self.csvb)
        if str(jaartal) not in self.eerste_regel:
            raise AssertionError ("geen gegevens beschikbaar")
    def inwoners(self, regio):
        # BUG: the split parts are not stripped, so the final column is
        # '2010\n' rather than '2010' and index() raises ValueError.
        # Note also that this method returns None (no return statement).
        lijst_eerste_regel = self.eerste_regel.split(",")
        plaats_jaartal = lijst_eerste_regel.index(self.jaartal) # here is where the error occurs

data = """REGION,1910,1920,1930,1940,1950,1960,1970,1980,1990,2000,2010
Alabama,2138093,2348174,2646248,2832961,3061743,3266740,3444165,3893888,4040587,4447100,4779736
Alaska,64356,55036,59278,72524,128643,226167,300382,401851,550043,626932,710231"""
v = Volkstelling('2010',io.StringIO(data))
v.inwoners('Alabama')
## ValueError: '2010' not in list
Your code had several issues leading to 2010 being not found:
If you read in files, each line has a newline character, commonly represented as \n, at the end. Insert the following code into your inwoners function to see the newline character behind 2010:
print(lijst_eerste_regel)
You can remove whitespaces and newlines using the python function 'SOME STRING'.strip()
Your function did not return a value, so you get None from inwoners even if it would run correctly.
The following example works:
import io


class Volkstelling:
    """Census-table reader bound to a single census year."""

    def __init__(self, jaartal, csvb):
        """Remember the year and source, and validate the header row.

        Raises AssertionError when the requested year does not occur
        anywhere in the first line of the data.
        """
        import csv
        self.jaartal = jaartal
        self.csvb = csvb
        self.eerste_regel = next(self.csvb)
        if str(jaartal) not in self.eerste_regel:
            raise AssertionError("geen gegevens beschikbaar")

    def inwoners(self, regio):
        """Return the header-column index holding this object's year."""
        # strip() removes the trailing newline so '2010\n' matches '2010'.
        kolommen = list(map(str.strip, self.eerste_regel.split(",")))
        return kolommen.index(self.jaartal)


data = """REGION,1910,1920,1930,1940,1950,1960,1970,1980,1990,2000,2010
Alabama,2138093,2348174,2646248,2832961,3061743,3266740,3444165,3893888,4040587,4447100,4779736
Alaska,64356,55036,59278,72524,128643,226167,300382,401851,550043,626932,710231"""

v2 = Volkstelling('1920', io.StringIO(data))
print(v2.inwoners('Alabama'))
## -> prints 2
v1 = Volkstelling('2010', io.StringIO(data))
print(v1.inwoners('Alabama'))
## -> prints 11
I have a file named sample.txt which looks like below
ServiceProfile.SharediFCList[1].DefaultHandling=1
ServiceProfile.SharediFCList[1].ServiceInformation=
ServiceProfile.SharediFCList[1].IncludeRegisterRequest=n
ServiceProfile.SharediFCList[1].IncludeRegisterResponse=n
Here my requirement is to remove the brackets and the integer and enter os commands with that
ServiceProfile.SharediFCList.DefaultHandling=1
ServiceProfile.SharediFCList.ServiceInformation=
ServiceProfile.SharediFCList.IncludeRegisterRequest=n
ServiceProfile.SharediFCList.IncludeRegisterResponse=n
I am quite a newbie in Python. This is my first attempt. I have used these codes to remove the brackets:
#!/usr/bin/python
import re
import os
import sys

# Strip "[<n>]" (and "(...)") segments from every line of sample.txt and
# save the result to removed.cfg, e.g.
#   ServiceProfile.SharediFCList[1].DefaultHandling=1
# becomes
#   ServiceProfile.SharediFCList.DefaultHandling=1
#
# Fixes vs. the original:
#  * "print var1f = open(...)" was two statements fused into one line and
#    a SyntaxError; they are separated again below.
#  * Low-level os.open()/os.read() with a hard-coded 10000-byte buffer is
#    replaced by the high-level open(); the with-statement reads the whole
#    file and closes it deterministically.
with open("sample.txt") as infile:
    ret = infile.read()
print(ret)

var1 = re.sub("[\(\[].*?[\)\]]", "", ret)
print(var1)

with open("removed.cfg", "w+") as outfile:
    outfile.write(var1)
After this using the file as input I want to form application specific commands which looks like this:
cmcli INS "DefaultHandling=1 ServiceInformation="
and the next set as
cmcli INS "IncludeRegisterRequest=n IncludeRegisterRequest=y"
so basically now I want the all the output to be bunched to a set of two for me to execute the commands on the operating system.
Is there any way that I could bunch them up as set of two?
Reading 10,000 bytes of text into a string is really not necessary when your file is line-oriented text, and isn't scalable either. And you need a very good reason to be using os.open() instead of open().
So, treat your data as the lines of text that it is, and every two lines, compose a single line of output.
from __future__ import print_function
import re
command = [None,None]
cmd_id = 1
bracket_re = re.compile(r".+\[\d\]\.(.+)")
# This doesn't just remove the brackets: what you actually seem to want is
# to pick out everything after [1]. and ignore the rest.
with open("removed_cfg","w") as outfile:
with open("sample.txt") as infile:
for line in infile:
m = bracket_re.match(line)
cmd_id = 1 - cmd_id # gives 0, 1, 0, 1
command[cmd_id] = m.group(1)
if cmd_id == 1: # we have a pair
output_line = """cmcli INS "{0} {1}" """.format(*command)
print (output_line, file=outfile)
This gives the output
cmcli INS "DefaultHandling=1 ServiceInformation="
cmcli INS "IncludeRegisterRequest=n IncludeRegisterResponse=n"
The second line doesn't correspond to your sample output. I don't know how the input IncludeRegisterResponse=n is supposed to become the output IncludeRegisterRequest=y. I assume that's a mistake.
Note that this code depends on your input data being precisely as you describe it and has no error checking whatsoever. So if the format of the input is in reality more variable than that, then you will need to add some validation.
I have a script that was working earlier but now stops due to UnicodeEncodeError.
I am using Python 3.4.3.
The full error message is the following:
Traceback (most recent call last):
File "R:/A/APIDevelopment/ScivalPubsExternal/Combine/ScivalPubsExt.py", line 58, in <module>
outputFD.writerow(row)
File "C:\Python34\lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode character '\x8a' in position 413: character maps to <undefined>
How can I address this error?
The Python script is the following below:
import pdb
import csv,sys,os
import glob
import os
import codecs

# Merge the per-university Scival CSV exports found under csvSourceDir
# into one CSV (joinedFileOut), prefixing every data row with the
# institution name and its code, and record which publication years were
# seen for each institution (reported to Instyears.txt at the end).
os.chdir('R:/A/APIDevelopment/ScivalPubsExternal/Combine')
joinedFileOut='ScivalUpdate'
csvSourceDir="R:/A/APIDevelopment/ScivalPubsExternal/Combine/AustralianUniversities"

# create dictionary from Codes file (Institution names and codes)
# NOTE(review): Codes.csv is opened without an explicit encoding, so it is
# decoded with the platform default (cp1252 on Windows) -- confirm that
# matches how the file was produced; this mismatch is a likely source of
# the UnicodeEncodeError discussed in the question.
codes = csv.reader(open('Codes.csv'))
#rows of the file are stored as lists/arrays
InstitutionCodesDict = {}   # institution name -> institution code
InstitutionYearsDict = {}   # institution name -> list of years seen
for row in codes:
    #keys: instnames, #values: instcodes
    InstitutionCodesDict[row[0]] = row[1]
    #define year dictionary with empty values field
    InstitutionYearsDict[row[0]] = []

#to create a file descriptor for the outputfile, wt means text mode (also rt or r is the same)
with open(joinedFileOut,'wt') as csvWriteFD:
    #write the file (it is still empty here)
    outputFD=csv.writer(csvWriteFD,delimiter=',')
    #with closes the file at the end, if exception occurs then before that
    # open each scival file, create file descriptor (encoding needed) and then read it and print the name of the file
    if not glob.glob(csvSourceDir+"/*.csv"):
        print("CSV source files not found")
        sys.exit()
    for scivalFile in glob.glob(csvSourceDir+"/*.csv"):
        #with open(scivalFile,"rt", encoding="utf8") as csvInFD:
        with open(scivalFile,"rt", encoding="ISO-8859-1") as csvInFD:
            fileFD = csv.reader(csvInFD)
            print(scivalFile)
            # printon stays False until the header row (the one containing
            # "Title") has been seen; only rows after it are data rows.
            printon=False
            #reads all rows in file and creates lists/arrays of each row
            for row in fileFD:
                if len(row)>1:
                    #the next printon part is skipped when looping through the rows above the data because it is not set to true
                    if printon:
                        #inserts instcode and inst sequentially to each row where there is data and after the header row
                        row.insert(0, InstitutionCode)
                        row.insert(0, Institution)
                        # Re-format column 10 as "nnnn-nnnn" (zfill(8)
                        # suggests it holds an 8-digit identifier -- TODO
                        # confirm), or blank it when it is just "-".
                        if row[10].strip() == "-":
                            row[10] = " "
                        else:
                            p = row[10].zfill(8)
                            q = p[0:4] + '-' + p[4:]
                            row[10] = q
                        #writes output file
                        outputFD.writerow(row)
                    else:
                        if "Publications at" in row[1]:
                            #get institution name from cell B1
                            Institution=row[1].replace('Publications at the ', "").replace('Publications at ',"")
                            print(Institution)
                            #lookup institution code from dictionary
                            InstitutionCode=InstitutionCodesDict[Institution]
                        #printon gets set to TRUE after the header column
                        if "Title" in row[0]: printon=True
                        if "Publication years" in row[0]:
                            #get the year to print it later to see which years were pulled
                            year=row[1]
                            #add year to institution in dictionary
                            if not year in InstitutionYearsDict[Institution]:
                                InstitutionYearsDict[Institution].append(year)

# Write a report showing the institution name followed by the years for
# which we have that institution's data.
with open("Instyears.txt","w") as instReportFD:
    for inst in (InstitutionYearsDict):
        instReportFD.write(inst)
        for yr in InstitutionYearsDict[inst]:
            instReportFD.write(" "+yr)
        instReportFD.write("\n")
Make sure to use the correct encoding of your source and destination files. You open files in three locations:
codes = csv.reader(open('Codes.csv'))
: : :
with open(joinedFileOut,'wt') as csvWriteFD:
outputFD=csv.writer(csvWriteFD,delimiter=',')
: : :
with open(scivalFile,"rt", encoding="ISO-8859-1") as csvInFD:
fileFD = csv.reader(csvInFD)
This should look something like:
# Use the correct encoding. If you made this file on
# Windows it is likely Windows-1252 (also known as cp1252):
with open('Codes.csv', encoding='cp1252') as f:
codes = csv.reader(f)
: : :
# The output encoding can be anything you want. UTF-8
# supports all Unicode characters. Windows apps tend to like
# the files to start with a UTF-8 BOM if the file is UTF-8,
# so 'utf-8-sig' is an option.
with open(joinedFileOut,'w', encoding='utf-8-sig') as csvWriteFD:
outputFD=csv.writer(csvWriteFD)
: : :
# This file is probably the cause of your problem and is not ISO-8859-1.
# Maybe UTF-8 instead? 'utf-8-sig' will safely handle and remove a UTF-8 BOM
# if present.
with open(scivalFile,'r', encoding='utf-8-sig') as csvInFD:
fileFD = csv.reader(csvInFD)
The error is caused by an attempt to write a string containing a U+008A character using the default cp1252 encoding of your system. It is trivial to fix, just declare a latin1 encoding (or iso-8859-1) for your output file (because it just outputs the original byte without conversion):
with open(joinedFileOut,'wt', encoding='latin1') as csvWriteFD:
But this will only hide the real problem: where does this 0x8a character come from? My advice is to intercept the exception and dump the line where it occurs:
try:
outputFD.writerow(row)
except UnicodeEncodeError:
# print row, the name of the file being processed and the line number
It is probably caused by one of the input files not being is-8859-1 encoded but more probably utf8 encoded...
I'm trying to make an script which takes all rows starting by 'HELIX', 'SHEET' and 'DBREF' from a .txt, from that rows takes some specifical columns and then saves the results on a new file.
#!/usr/bin/python
import sys

# Extract selected columns from HELIX / SHEET / DBREF records of a PDB
# file (argv[1]) and write them to the output file named in argv[2].
if len(sys.argv) != 3:
    # Fix: the original print line ended in a stray backtick (`), which
    # is a SyntaxError in Python.
    print("2 Parameters expected: You must introduce your pdb file and a name for output file.")
    exit()

for line in open(sys.argv[1]):
    if 'HELIX' in line:
        helix = line.split()
        # NOTE(review): this builds a TUPLE of a string and two lists,
        # which is what later makes file.write() fail with "expected a
        # character buffer object" (see the traceback in the question).
        # Only the LAST matching line of each kind is kept, too.
        cols_h = helix[0], helix[3:6:2], helix[6:9:2]
    elif 'SHEET' in line:
        sheet = line.split()
        cols_s = sheet[0], sheet[4:7:2], sheet[7:10:2], sheet[12:15:2], sheet[16:19:2]
    elif 'DBREF' in line:
        dbref = line.split()
        cols_id = dbref[0], dbref[3:5], dbref[8:10]

modified_data = open(sys.argv[2],'w')
modified_data.write(cols_id)
modified_data.write(cols_h)
modified_data.write(cols_s)
My problem is that when I try to write my final results it gives this error:
Traceback (most recent call last):
File "funcional2.py", line 21, in <module>
modified_data.write(cols_id)
TypeError: expected a character buffer object
When I try to convert to a string using ''.join() it returns another error
Traceback (most recent call last):
File "funcional2.py", line 21, in <module>
modified_data.write(' '.join(cols_id))
TypeError: sequence item 1: expected string, list found
What am I doing wrong?
Also, if there is some easy way to simplify my code, it'll be great.
PS: I'm no programmer so I'll probably need some explanation if you do something...
Thank you very much.
cols_id, cols_h and cols_s seems to be lists, not strings.
You can only write a string in your file so you have to convert the list to a string.
modified_data.write(' '.join(cols_id))
and similar.
'!'.join(a_list_of_things) converts the list into a string separating each element with an exclamation mark
EDIT:
#!/usr/bin/python
import sys

# Collect the selected columns of every HELIX / SHEET / DBREF line in the
# PDB file (argv[1]) and write them all to the output file (argv[2]).
if len(sys.argv) != 3:
    # Fix: removed the stray trailing backtick (SyntaxError).
    print("2 Parameters expected: You must introduce your pdb file and a name for output file.")
    exit()

# Fix: `cols_h, cols_s, cols_id = []` raised
# "ValueError: not enough values to unpack" -- each name needs its own list.
cols_h, cols_s, cols_id = [], [], []

for line in open(sys.argv[1]):
    if 'HELIX' in line:
        helix = line.split()
        # Fix: helix[0] is a str while helix[3:6:2] is a list, so
        # helix[0] + helix[3:6:2] raised TypeError; wrap the first
        # element in a list before concatenating, then join.
        cols_h.append(''.join([helix[0]] + helix[3:6:2] + helix[6:9:2]))
    elif 'SHEET' in line:
        sheet = line.split()
        cols_s.append(''.join([sheet[0]] + sheet[4:7:2] + sheet[7:10:2] + sheet[12:15:2] + sheet[16:19:2]))
    elif 'DBREF' in line:
        dbref = line.split()
        cols_id.append(''.join([dbref[0]] + dbref[3:5] + dbref[8:10]))

modified_data = open(sys.argv[2], 'w')
cols = [cols_id, cols_h, cols_s]
for col in cols:
    modified_data.write(''.join(col))
Here is a solution (untested) that separates data and code a little more. There is a data structure (keyword_and_slices) describing the keywords searched in the lines paired with the slices to be taken for the result.
The code then goes through the lines and builds a data structure (keyword2lines) mapping the keyword to the result lines for that keyword.
At the end the collected lines for each keyword are written to the result file.
import sys
from collections import defaultdict


def main():
    """Extract configured slices from HELIX/SHEET/DBREF lines.

    Reads the PDB file named in argv[1] and writes the collected result
    lines, grouped by keyword (DBREF, then HELIX, then SHEET), to the
    output file named in argv[2].
    """
    if len(sys.argv) != 3:
        print(
            '2 Parameters expected: You must introduce your pdb file'
            ' and a name for output file.'
        )
        sys.exit(1)
    input_filename, output_filename = sys.argv[1:3]

    # Keyword -> slices taken from a line starting with that keyword.
    slice_table = {
        'HELIX': [slice(3, 6, 2), slice(6, 9, 2)],
        'SHEET': [slice(4, 7, 2), slice(7, 10, 2),
                  slice(12, 15, 2), slice(16, 19, 2)],
        'DBREF': [slice(3, 5), slice(8, 10)],
    }

    # Keyword -> list of already-formatted result lines.
    collected = defaultdict(list)
    with open(input_filename, 'r') as lines:
        for line in lines:
            for keyword, cuts in slice_table.items():
                if not line.startswith(keyword):
                    continue
                parts = line.split()
                pieces = [keyword]
                for cut in cuts:
                    pieces.extend(parts[cut])
                collected[keyword].append(' '.join(pieces) + '\n')

    with open(output_filename, 'w') as out_file:
        for keyword in ['DBREF', 'HELIX', 'SHEET']:
            out_file.writelines(collected[keyword])


if __name__ == '__main__':
    main()
The code follows your text in checking if a line starts with a keyword, instead your code which checks if a keyword is anywhere within a line.
It also makes sure all files are closed properly by using the with statement.
You need to convert the tuple created on RHS in your assignments to string.
# Replace this with the statement given below
#   cols_id = dbref[0], dbref[3:5], dbref[8:10]
# Fix: the originally suggested
#   cols_id = ''.join((dbref[0], dbref[3:5], dbref[8:10]))
# still raises TypeError, because str.join() accepts only strings and
# dbref[3:5] / dbref[8:10] are lists.  Flatten everything into one list
# of strings first, then join.
cols_id = ''.join([dbref[0]] + dbref[3:5] + dbref[8:10])