I've been trying to make my python code to fill a form in word with data that i scraped off the Internet. I wrote the data in a txt file and are now trying to fill the word file with this code:
import zipfile
import os
import tempfile
import shutil
import codecs
def getXml(docxFilename,ReplaceText):
zip = zipfile.ZipFile(open(docxFilename,"rb"))
xmlString= zip.read("word/document.xml")
for key in ReplaceText.keys():
xmlString = xmlString.replace(str(key), str(ReplaceText.get(key)))
return xmlString
def createNewDocx(originalDocx,xmlString,newFilename):
tmpDir = tempfile.mkdtemp()
zip = zipfile.ZipFile(open(originalDocx,"rb"))
zip.extractall(tmpDir)
#3tmpDir=tmpDir.decode("utf-8")
with open(os.path.join(tmpDir,"word/document.xml"),"w") as f:
f.write(xmlString)
filenames = zip.namelist()
zipCopyFilename = newFilename
with zipfile.ZipFile(zipCopyFilename,"w") as docx:
for filename in filenames:
docx.write(os.path.join(tmpDir,filename),filename)
shutil.rmtree(tmpDir)
f=open('test.txt', 'r',)
text=f.read().split("\n")
print text[1]
Pavarde = text[1]
Replace = {"PAVARDE1":Pavarde}
createNewDocx("test.docx",getXml("test.docx",Replace),"test2.docx")
The file is created but I cant open it.
I get the following error:
Illegal xlm character
My guess would be that theres something with the encoding but I cant find a solution.
Related
I have the following code (comments explain what is occuring):
import os
from io import StringIO
from PyPDF2 import PdfFileReader
# Path to the directory containing the PDF files
pdf_dir = '/path/to/pdf/files'
# Iterate over the files in the directory
for filename in os.listdir(pdf_dir):
# Check if the file is a PDF file
if filename.endswith('.pdf'):
# Construct the full path to the file
filepath = os.path.join(pdf_dir, filename)
# Open the PDF file and read its contents
with open(filepath, 'rb') as f:
pdf = PdfFileReader(f)
# Extract the text from the PDF file
text = ''
for page in pdf.pages:
text += page.extractText()
# Construct the name of the output text file
txt_filename = filename[:-4] + '.txt'
# Write the text to the output file
with open(txt_filename, 'w') as f:
f.write(text)
When I run the code, it produces a Xref table not zero-indexed. ID numbers for objects will be corrected warning. It is not a hard error, but it makes me wonder if there's a different way I should be doing this.
Thanks for any suggestions.
I have over 200 scraped files in json format and I want to analyse them. I can open them individually, but would like to loop through to save time as I will be doing this a lot.
Can open each file but want to be able to do a loop in some format
e.g.
with codecs.open('c:\\project\\input*.json','r','utf-8') as f:
where '*' is a number.....
import codecs, json, csv, re
#read a json file downloaded with twitterscraper
with codecs.open('c:\\project\\input1.json','r','utf-8') as f:
tweets = json.load(f,encoding='utf-b')
Just put your files into a folder and then loop through the files in the folder like so.
import codecs
import json
import csv
import re
import os
files = []
for file in os.listdir("/mydir"):
if file.endswith(".json"):
files.append(os.path.join("/mydir", file))
for file in files:
with codecs.open(file,'r','utf-8') as f:
tweets = json.load(f,encoding='utf-b')
Add, and use, glob to iterate over files with certain file pattern.
import glob
import codecs
import json
# ... more packages here
for file in glob.glob('c:\\project\\input*.json'):
with codecs.open(file, 'r','utf-8') as f:
tweets = json.load(f, encoding='utf-b')
#... whatever you do next with `tweets`
BTW: utf-b instead of utf-8?
Attempting to use a code to pull all .xml file names from a particular directory in a repo, then fliter out all the files which contain a certain key word. The part of the code which pulls the file names and puts it into the first text file works properly. The second part which takes the first text file and filters it into a second text file does not work as anticipated. I am not recieving an error or anything but the array of lines is empty which is odd because the text file is not. I am wondering if anyone sees anything obvious that I am missing I have been at this for a long time so its easy to miss simple things any help would be appreciated. I have looked into other examples and the way they did it is similar to mine logically just not sure where I went wrong. Thank you in advance
This is the code:
#!/usr/bin/python
import glob
import re
import os
import fnmatch
from pprint import pprint
import xml.etree.ElementTree as ET
import cPickle as pickle
#open a text output file
text_file = open("TestCases.txt", "w")
matches = []
#initialize the array called matches with all the xml files in the selected directories (bbap, bbsc, bbtest, and bbrtm
for bbdir in ['bbap/nr', 'bbsc/nr','bbtest/nr', 'bbrtm/nr']:
for root, dirnames,filenames in os.walk('/repo/bob/ebb/'+bbdir):
for filename in fnmatch.filter(filenames, '*.xml'):
matches.append(os.path.join(root,filename))
#for each listing in matches test it against the desired filter to achieve the wanted tests
for each_xml in matches:
if each_xml.find('dlL1NrMdbfUPe') != -1:
tree = ET.parse(each_xml)
root = tree.getroot()
for child in root:
for test_suite in child:
for test_case in test_suite:
text_file.write(pickle.dumps(test_case.attrib))
#modify the text so it is easy to read
with open("TestCases.txt") as f:
with open("Formatted_File", "w") as f1:
for line in f:
if "DLL1NRMDBFUPE" in line:
f1.write(line)
Just As I had anticipated, the error was one made from looking at code for too long. I simply forgot to close text_file before opening f.
Fix:
import glob
import re
import os
import fnmatch
from pprint import pprint
import xml.etree.ElementTree as ET
import cPickle as pickle
#open a text output file
text_file = open("TestCases.txt", "w")
matches = []
#initialize the array called matches with all the xml files in the selected directories (bbap, bbsc, bbtest, and bbrtm
for bbdir in ['bbap/nr', 'bbsc/nr','bbtest/nr', 'bbrtm/nr']:
for root, dirnames,filenames in os.walk('/repo/bob/ebb/'+bbdir):
for filename in fnmatch.filter(filenames, '*.xml'):
matches.append(os.path.join(root,filename))
#for each listing in matches test it against the desired filter to achieve the wanted tests
for each_xml in matches:
if each_xml.find('dlL1NrMdbfUPe') != -1:
tree = ET.parse(each_xml)
root = tree.getroot()
for child in root:
for test_suite in child:
for test_case in test_suite:
text_file.write(pickle.dumps(test_case.attrib))
**text_file.close()**
#modify the text so it is easy to read
with open("TestCases.txt") as f:
with open("Formatted_File", "w") as f1:
for line in f:
if "DLL1NRMDBFUPE" in line:
f1.write(line)
f.close()
f1.close()
I have a script that gets all of the .zip files from a folder, then one by one, opens the zip file, loads the content of the JSON file inside and imports this to MongoDB.
The error I am getting is the JSON object must be str, bytes or bytearray, not 'TextIOWrapper'
The code is:
import json
import logging
import logging.handlers
import os
from logging.config import fileConfig
from pymongo import MongoClient
def import_json():
try:
client = MongoClient('5.57.62.97', 27017)
db = client['vuln_sets']
coll = db['vulnerabilities']
basepath = os.path.dirname(__file__)
filepath = os.path.abspath(os.path.join(basepath, ".."))
archive_filepath = filepath + '/vuln_files/'
filedir = os.chdir(archive_filepath)
for item in os.listdir(filedir):
if item.endswith('.json'):
file_name = os.path.abspath(item)
fp = open(file_name, 'r')
json_data = json.loads(fp)
for vuln in json_data:
print(vuln)
coll.insert(vuln)
os.remove(file_name)
except Exception as e:
logging.exception(e)
I can get this working to use a single file but not multiple, i.e. to do one file I wrote:
from zipfile import ZipFile
import json
import pymongo
archive = ZipFile("vulners_collections/cve.zip")
archived_file = archive.open(archive.namelist()[0])
archive_content = archived_file.read()
archived_file.close()
connection = pymongo.MongoClient("mongodb://localhost")
db=connection.vulnerability
vuln1 = db.vulnerability_collection
vulners_objects = json.loads(archive_content)
for item in vulners_objects:
vuln1.insert(item)
From my comment above:
I have no experience with glob, but from skimming the doc I get the impression your archive_files is a simple list of file-paths as strings, correct? You can not perform actions like .open on string (thus your error), so try changing your code to this:
...
archive_filepath = filepath + '/vuln_files/'
archive_files = glob.glob(archive_filepath + "/*.zip")
for file in archive_files:
with open(file, "r") as currentFile:
file_content = currentFile.read()
vuln_content = json.loads(file_content)
for item in vuln_content:
coll.insert(item)
...
file is NOT a file object or anything but just a simple string. So you cant perform methods on it that are not supported by string.
You are redefining your iterator by setting it to the result of the namelist method. You need a for loop within the for to go through the contents of the zip file and of course a new iterator variable.
Isn't file.close wrong and the correct call is file.close().
U can use json.load() to load file directly, instead of json.loads()
fp = open(file_name, 'r')
json_data = json.load(fp)
fp.close()
I am new to python and I am using following code to pull output as sentiment analysis:
import json
from watson_developer_cloud import ToneAnalyzerV3Beta
import urllib.request
import codecs
import csv
import os
import re
import sys
import collections
import glob
ipath = 'C:/TEMP/' # input folder
opath = 'C:/TEMP/matrix/' # output folder
reader = codecs.getreader("utf-8")
tone_analyzer = ToneAnalyzerV3Beta(
url='https://gateway.watsonplatform.net/tone-analyzer/api',
username='ABCID',
password='ABCPASS',
version='2016-02-11')
path = 'C:/TEMP/*.txt'
file = glob.glob(path)
text = file.read()
data=tone_analyzer.tone(text='text')
for cat in data['document_tone']['tone_categories']:
print('Category:', cat['category_name'])
for tone in cat['tones']:
print('-', tone['tone_name'],tone['score'])
#create file
In the above code all I am trying to do is to read the file and do sentiment analysis all the text file stored in C:/TEMP folder but I keep getting and error :'list' object has no attribute 'read'
Not sure where I am going wrong and I would really appreciate any help with this one. Also, is there a way i can write the output to a CSV file so if I am reading the file
ABC.txt and I create a output CSV file called ABC.csv with output values.
Thank You
glob returns a list of files, you need to iterate over the list, open each file and then call .read on the file object:
files = glob.glob(path)
# iterate over the list getting each file
for fle in files:
# open the file and then call .read() to get the text
with open(fle) as f:
text = f.read()
Not sure what it is exactly you want to write but the csv lib will do it:
from csv import writer
files = glob.glob(path)
# iterate over the list getting each file
for fle in files:
# open the file and then call .read() to get the text
with open(fle) as f, open("{}.csv".format(fle.rsplit(".", 1)[1]),"w") as out:
text = f.read()
wr = writer(out)
data = tone_analyzer.tone(text='text')
wr.writerow(["some", "column" ,"names"]) # write the col names
Then call writerow passing a list of whatever you want to write for each row.