All text is saved in one line - Python

So, I was trying to use NLTK in Python to apply part-of-speech tagging to a text file.
This is the code I used:
import nltk
from nltk import word_tokenize, pos_tag
f = open('all.txt')
raw = f.read()
text = word_tokenize(raw)
paosted = nltk.pos_tag(text)
saveFile = open('ol.txt', 'w')
saveFile.write(str(paosted))
saveFile.close()
The code did work, but the problem is that it saved all the text on one single line, as shown in the attached picture. I know I should be using "\n" newline characters, but I am a novice in Python and have no idea how to do it, so any help would be appreciated :)
-------- UPDATE -----------
Well, people have been really helpful and offered some solutions, e.g., this code:
import nltk
from nltk import word_tokenize, pos_tag
f = open('all.txt')
raw = f.read()
text = word_tokenize(raw)
paosted = nltk.pos_tag(text)
saveFile = open('ol.txt', 'w')
saveFile.write(str(paosted).replace('),', '),\n'))
saveFile.close()
But I still need to have it in the form of a paragraph, because I am going to use it later in concordance software. Please have a look at this screenshot:
https://i.stack.imgur.com/tU1NW.png

paosted is a list of tuples; you can iterate over it and write each tuple on its own line.
Ex:
paosted = nltk.pos_tag(text)
saveFile = open('ol.txt', 'w')
for line in paosted:
    saveFile.write(str(line) + "\n")
saveFile.close()
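As a side note, a with block would close the file automatically; a small variant of the same loop (my suggestion, not from the thread):
with open('ol.txt', 'w') as saveFile:
    for line in paosted:
        # each tagged (word, tag) tuple goes on its own line
        saveFile.write(str(line) + "\n")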

Updating my answer accordingly:
temp = []
for i in paosted:
    temp.append("_".join(i))
result = " ".join(temp)

Thank you all! I followed some of your instructions and the best result I got was with this code:
import nltk
from nltk import word_tokenize, pos_tag
f = open('all.txt')
raw = f.read()
text = word_tokenize(raw)
paosted = nltk.pos_tag(text)
saveFile = open('output.txt', 'w')
saveFile.write(str(paosted).replace("('.', '.')" , "\n"))
saveFile.close()
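For anyone after the same paragraph-style output: replacing the literal "('.', '.')" repr is fragile. A more robust sketch (my variation, assuming the same all.txt input) joins word_TAG pairs and starts a new line at sentence-final periods:
import nltk
from nltk import word_tokenize, pos_tag

with open('all.txt') as f:
    raw = f.read()

paosted = nltk.pos_tag(word_tokenize(raw))

with open('output.txt', 'w') as saveFile:
    for word, tag in paosted:
        if (word, tag) == ('.', '.'):
            # end of sentence: close the line
            saveFile.write(".\n")
        else:
            saveFile.write(word + "_" + tag + " ")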


Search and replace strings in XML using Python

I am trying to search for certain words in my .xml file and replace them with others, but I am struggling a bit.
I have been using this code so far:
import xml.etree.ElementTree as ET

with open('Rom1.xml', encoding="utf8") as f:
    tree = ET.parse(f)

#root = tree.find('ExportedObjects')
root = tree.getroot()
for elem in root.iter():
    try:
        elem.text = elem.text.replace('Rom1', 'Rom2')
    except AttributeError:
        pass
[Screenshot: Rom1.xml, a snapshot from the XML file showing the structure]
The XML file is pretty big, but it contains the string 'Rom1' 41 times and I would like to replace all of them.
I know a simple search and replace in a text editor does the job, but I want to automate this since I will be doing it for several hundred files.
Any help is appreciated :)
If there is no possibility of ambiguity then you could just do this:
with open('Rom1.xml', encoding='utf-8', mode='r+') as xml:
    content = xml.read().replace('Rom1', 'Rom2')
    xml.seek(0)
    xml.write(content)
    xml.truncate()
In this case the truncate() call is not necessary. However, if the second argument to replace() were shorter than the first, it would be crucial: the rewritten content would not cover the whole of the old content, leaving stale characters at the end of the file. Just leave it there to account for all eventualities.
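A toy illustration of that point (hypothetical file demo.txt containing the text "HelloHello"):
# demo.txt initially contains "HelloHello" (10 characters)
with open('demo.txt', mode='r+') as f:
    content = f.read().replace('Hello', 'Hi')  # "HiHi", only 4 characters
    f.seek(0)
    f.write(content)
    f.truncate()  # without this, the file would read "HiHioHello"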
Ok so I tried something else with great success:
import xml.etree.ElementTree as ET

Rom2 = input('Number: ')
input_file = "Rom1.xml"
output_file = Rom2 + ".xml"

with open(input_file) as f:
    xml_content = f.readlines()

with open(output_file, 'w+') as f:
    for line in xml_content:
        f.write(line.replace('Rom1', Rom2))
But if I want to replace a second string as well, e.g. 'SQ4XXX' with 'SQ4050', then it makes both replacements but the old strings are still there too? I'm confused.
import xml.etree.ElementTree as ET

Rom2 = input('Number: ')
sq = input('SQ: ')
input_file = "Rom1.xml"
output_file = Rom2 + ".xml"

with open(input_file) as f:
    xml_content = f.readlines()

with open(output_file, 'w+') as f:
    for line in xml_content:
        # note: every source line is written twice here, once per replacement
        f.write(line.replace('Rom1', Rom2))
        f.write(line.replace('SQ4XXX', sq))
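The duplication happens because each line is written twice, once per f.write() call; chaining the replacements keeps a single write per line (a sketch of the fix, not from the thread):
with open(output_file, 'w+') as f:
    for line in xml_content:
        # both replacements applied to the same line, written once
        f.write(line.replace('Rom1', Rom2).replace('SQ4XXX', sq))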
OK, I got it working like I wanted, thanks for the help guys!
Here's the final code:
import xml.etree.ElementTree as ET

Rom2 = input('Number: ')
sq4 = input('SQ4: ')
sq5 = input('SQ5: ')
input_file = "Rom1.xml"
output_file = Rom2 + ".xml"

with open(input_file) as f:
    xml_content = f.readlines()

with open(output_file, 'w+') as f:
    for line in xml_content:
        f.write(line.replace('Rom1', Rom2))

with open(output_file, encoding='utf-8', mode='r+') as xml:
    content = xml.read().replace('SQ4XXX', sq4)
    xml.seek(0)
    xml.write(content)
    xml.truncate()

with open(output_file, encoding='utf-8', mode='r+') as xml:
    content = xml.read().replace('SQ5XXX', sq5)
    xml.seek(0)
    xml.write(content)
    xml.truncate()
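For what it's worth, the three passes could be collapsed into a single read-replace-write (a minimal sketch under the same inputs; the ElementTree import is not actually needed for plain text replacement):
Rom2 = input('Number: ')
sq4 = input('SQ4: ')
sq5 = input('SQ5: ')

with open("Rom1.xml", encoding='utf-8') as f:
    content = f.read()

# apply all three replacements in one pass
content = content.replace('Rom1', Rom2).replace('SQ4XXX', sq4).replace('SQ5XXX', sq5)

with open(Rom2 + ".xml", 'w', encoding='utf-8') as f:
    f.write(content)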

reader.decrypt(word) NotImplementedError: only algorithm code 1 and 2 are supported

I am using the PyPDF2 library to decrypt a PDF file. Instead of directly providing the password, I am using a wordlist.
import PyPDF2

ENCRYPTED_FILE_PATH = 'pdf_protected.pdf'

pdffile = open('dictionary.txt')
w = pdffile.read()
wordlist = w.split('\n')

with open(ENCRYPTED_FILE_PATH, mode='rb') as f:
    reader = PyPDF2.PdfFileReader(f)
    if reader.isEncrypted:
        for word in wordlist:
            reader.decrypt(word)
    print(f"Number of page: {reader.getNumPages()}")
And I don't understand how to solve this error.
Thank you :)
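As an aside (not from the thread): in the old PyPDF2 API, decrypt() returns a non-zero code when a password matches, so the loop could stop at the first hit instead of calling decrypt() for every word; a sketch under that assumption:
with open(ENCRYPTED_FILE_PATH, mode='rb') as f:
    reader = PyPDF2.PdfFileReader(f)
    if reader.isEncrypted:
        for word in wordlist:
            # decrypt() returns 0 on failure, 1 or 2 on success (old PyPDF2 API)
            if reader.decrypt(word):
                print(f"Password found: {word}")
                break
    print(f"Number of page: {reader.getNumPages()}")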

How to write a list of sentences into text file in Python

Hi. I would like to ask how to print a list of sentences to a text file. I tried to use the write() function to export the output shown below, but I couldn't get the output to look like it does in the Python shell.
import os
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag

def preprocess():
    with open("E:\FYP 2\Error Detection Data\Data\Raw Data\D1.txt", "r") as fin, open("E:\FYP 2\Error Detection Data\Data\Raw Data\D1_edit.txt", "w") as fout:
        for sent in sent_tokenize(fin.read()):
            words = word_tokenize(sent)
            tag = pos_tag(words)
            processed_sentence = [w for w in tag]
            print(processed_sentence)
            print("\n")
            for i in range(0, len(processed_sentence) - 1):
                sentsplit = processed_sentence[i]
                fout.write('\n'.join(sentsplit))
    fin.close()
    fout.close()

preprocess()
Current output in text file:
Android
NNPsmartphones
NNSplay
VBPa
DTmajor
JJrole
NNin
INtoday
NN's
POSworld
NNThe
Output that I want in text file:
Android,NNP smartphones,NNS play,VBP a,DT major,JJ role,NN in,IN today,NN 's,POS world,NN .,.
Instead of:
for i in range(0, len(processed_sentence) - 1):
    sentsplit = processed_sentence[i]
    fout.write('\n'.join(sentsplit))
In Python 3 try:
print(processed_sentence, file=fout)
In Python 2 try:
print >> fout, processed_sentence
You also do not need to call fout.close() or fin.close(), because the with context manager handles that for you.
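That prints the list's repr, though. To get exactly the word,TAG format from the question, the pairs can be joined explicitly (a sketch, assuming processed_sentence is a list of (word, tag) tuples):
# produces "Android,NNP smartphones,NNS play,VBP ..." - one sentence per line
line = " ".join(word + "," + tag for word, tag in processed_sentence)
fout.write(line + "\n")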

Optimization of Parsing of Python Scripts

I want to apply a regex to every line in my txt file.
For example:
comments={ts=2010-02-09T04:05:20.777+0000,comment_id=529590|2886|LOL|Baoping Wu|529360}
comments={ts=2010-02-09T04:20:53.281+0000, comment_id=529589|2886|cool|Baoping Wu|529360}
comments={ts=2010-02-09T05:19:19.802+0000,comment_id=529591|2886|ok|Baoping Wu|529360}
My Python Code is:
import re
p = re.compile(ur'(comment_id=)(\d+)\|(\d+)\|([^|]+)\|([^|]+)\|(\d+)', re.MULTILINE|re.DOTALL)
#open =
test_str = r"comments={ts=2010-02-09T04:05:20.777+0000, comment_id=529590|2886|LOL|Baoping Wu|529360}"
subst = ur"\1\2, user_id = \3, comment='\4', user= '\5', post_commented=\6"
result = re.sub(p, subst, test_str)
print result
I want to solve it with the help of MULTILINE, but it doesn't work.
Can anyone help me?
The Output for the first line should be
comments={ts=2010-02-09T04:05:20.777+0000, comment_id=529590, user_id = 2886, comment='LOL', user= 'Baoping Wu', post_commented=529360}
My issue is only to apply the regex to every line and write the result to a txt file.
Your regex works without having to use MULTILINE or DOTALL; you can replace through the entire document at once. In action:
import re

with open('file.txt', 'r') as f:
    txt = f.read()

pattern = r'(comment_id=)(\d+)\|(\d+)\|([^|]+)\|([^|]+)\|(\d+)'
repl = r"\1\2, user_id = \3, comment='\4', user= '\5', post_commented=\6"
result = re.sub(pattern, repl, txt)

with open('file2.txt', 'w') as f:
    f.write(result)
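If processing line by line is preferred, as the question phrased it, an equivalent sketch with a precompiled pattern:
import re

# same pattern and replacement as above, applied per line
pattern = re.compile(r'(comment_id=)(\d+)\|(\d+)\|([^|]+)\|([^|]+)\|(\d+)')
repl = r"\1\2, user_id = \3, comment='\4', user= '\5', post_commented=\6"

with open('file.txt') as fin, open('file2.txt', 'w') as fout:
    for line in fin:
        fout.write(pattern.sub(repl, line))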

How to create an index using Whoosh

I am trying to use Whoosh for text searching for the first time. I want to search for documents containing the word "XML". But because I am new to Whoosh, I just wrote a program that searches for a word in a single document, where the document is a text file (myRoko.txt).
import os, os.path
from whoosh import index
from whoosh.index import open_dir
from whoosh.fields import Schema, ID, TEXT
from whoosh.qparser import QueryParser
from whoosh.query import *

if not os.path.exists("indexdir3"):
    os.mkdir("indexdir3")

schema = Schema(name=ID(stored=True), content=TEXT)
ix = index.create_in("indexdir3", schema)
writer = ix.writer()

path = "myRoko.txt"
with open(path, "r") as f:
    content = f.read()
f.close()

writer.add_document(name=path, content=content)
writer.commit()

ix = open_dir("indexdir3")
query_b = QueryParser('content', ix.schema).parse('XML')

with ix.searcher() as srch:
    res_b = srch.search(query_b)
    print res_b[0]
The above code is supposed to print the document that contains the word "XML". However, the code returns the following error:
raise ValueError("%r is not unicode or sequence" % value)
ValueError: 'A large number of documents are now represented and stored
as XML document on the web. Thus ................
What could be the cause of this error?
You have a Unicode problem. You should pass unicode strings to the indexer. For that, you need to open the text file as unicode:
import codecs
with codecs.open(path, "r", "utf-8") as f:
    content = f.read()
and use a unicode string for the file name:
path = u"myRoko.txt"
After fixes I got this result:
<Hit {'name': u'myRoko.txt'}>
writer.add_document(name=unicode(path), content=unicode(content))
It has to be UNICODE
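Putting both answers together, a consolidated sketch of the fix (Python 2, same file and index as above):
import codecs

# a unicode path and unicode content keep Whoosh's indexer happy (Python 2)
path = u"myRoko.txt"
with codecs.open(path, "r", "utf-8") as f:
    content = f.read()

writer.add_document(name=path, content=content)
writer.commit()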
