Cherrypy and Parsing XML Data from multiple files - python

So this is sort of a piggy-back post of another question I had. I've successfully pulled data from multiple xml files and am able to get the data to display within the terminal using the print function, but when I try to use the return function to show the data in the browser, I only get the data from the first file. Any ideas on why I only get data from the first file rather than all of them? Thanks!
from xml.dom.minidom import parse, parseString
import os, glob, re
import cherrypy
# CherryPy root handler. `index` parses every *.xml file found under `path`
# and returns order details for the browser; `exit` terminates the process.
class Root(object):
def index(self):
# NOTE(review): not a raw string; this relies on \V and \X not being escape sequences.
path = 'C:\Vestigo\XML'
TOTALXML = len(glob.glob(os.path.join(path, '*.xml')))
print TOTALXML
# NOTE(review): `i` is incremented below but never read in this snippet.
i = 0
for XMLFile in glob.glob(os.path.join(path, '*.xml')):
xmldoc = parse(XMLFile)
# For each field, take the text of the first element with that tag name.
order_number = xmldoc.getElementsByTagName('Extrinsic')[0].firstChild.data
order_name = xmldoc.getElementsByTagName('DeliverTo')[0].firstChild.data
street1 = xmldoc.getElementsByTagName('Street1')[0].firstChild.data
state = xmldoc.getElementsByTagName('State')[0].firstChild.data
zip_code = xmldoc.getElementsByTagName('PostalCode')[0].firstChild.data
# OUTPUTi is one plain name that is reassigned for every file; it is not indexed by `i`.
OUTPUTi = order_number+' '+order_name+' '+street1+' '+state+' '+zip_code
i += 1
print OUTPUTi
# NOTE(review): indentation was lost in this paste, so whether this return sits
# inside or after the loop cannot be determined from the text shown here.
return (OUTPUTi, """<br><br>Quit""")
# Presumably marks the method as reachable over HTTP (CherryPy `exposed` flag) - confirm.
index.exposed = True
def exit(self):
# Raising SystemExit(0) asks the interpreter to exit with status 0.
raise SystemExit(0)
exit.exposed = True
def start():
    """Mount ``Root`` at '/' and run the CherryPy engine until it stops.

    ``webbrowser.open`` is handed to the engine as a start-up callback
    together with the local site URL.
    """
    import webbrowser

    site_url = 'http://localhost:8080/'
    cherrypy.tree.mount(Root(), '/')
    cherrypy.engine.start_with_callback(webbrowser.open, (site_url,))
    cherrypy.engine.block()


if __name__ == '__main__':
    start()

You are not collecting the data anywhere; you store everything in a variable named OUTPUTi, then only return the last iteration of that variable. Python does not magically make that variable use the i counter.
Use a list to collect the strings:
TOTALXML = len(glob.glob(os.path.join(path, '*.xml')))
print TOTALXML
OUTPUT = []
for XMLFile in glob.glob(os.path.join(path, '*.xml')):
xmldoc = parse(XMLFile)
order_number = xmldoc.getElementsByTagName('Extrinsic')[0].firstChild.data
order_name = xmldoc.getElementsByTagName('DeliverTo')[0].firstChild.data
street1 = xmldoc.getElementsByTagName('Street1')[0].firstChild.data
state = xmldoc.getElementsByTagName('State')[0].firstChild.data
zip_code = xmldoc.getElementsByTagName('PostalCode')[0].firstChild.data
OUTPUT.append(order_number+' '+order_name+' '+street1+' '+state+' '+zip_code)
print OUTPUT[-1]
OUTPUT = ''.join(OUTPUT)
return (OUTPUT, """<br><br>Quit""")

Related

Iteratively process large wikipedia dump

I want to parse a large wikipedia dump iteratively. I found a tutorial for this here: https://towardsdatascience.com/wikipedia-data-science-working-with-the-worlds-largest-encyclopedia-c08efbac5f5c
However, when I want to read in the data like this:
data_path = 'C:\\Users\\Me\\datasets\\dewiki-latest-pages-articles1.xml-p1p262468.bz2'
import xml.sax
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX.

    Collects the text of ``title`` and ``text`` elements and appends a
    ``(title, text)`` tuple to ``_pages`` each time a ``page`` element closes.
    """

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def characters(self, content):
        """Buffer character data once a tracked tag has been seen."""
        if not self._current_tag:
            return
        self._buffer.append(content)

    def startElement(self, name, attrs):
        """Start a fresh buffer when a ``title`` or ``text`` element opens."""
        if name not in ('title', 'text'):
            return
        self._current_tag = name
        self._buffer = []

    def endElement(self, name):
        """Store the buffered text; emit a page tuple when ``page`` closes."""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            page = (self._values['title'], self._values['text'])
            self._pages.append(page)
# Object for handling xml
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

# NOTE(review): `subprocess` is used below, but no import of it is visible
# in this snippet.
# Iteratively process file: the opened archive is given to `bzcat` as stdin
# and each line of its stdout is fed to the parser.
bzcat = subprocess.Popen(['bzcat'],
                         stdin=open(data_path),
                         stdout=subprocess.PIPE, shell=True)
for line in bzcat.stdout:
    parser.feed(line)

    # Stop once more than 3 articles have been collected
    if len(handler._pages) > 3:
        break
it seems like nothing happens. The handler._pages list is empty. This is where the parsed articles should be stored. I also added shell=True because otherwise I get the error message FileNotFoundError: [WinError 2].
I never worked with subprocesses in python so I don't know what the problem might be.
I also tried to specify the data_path differently (with / and //).
Thank you in advance.

ValueError: stat: path too long for Windows on Jupyter Notebook parsing a URL request

I am trying to parse my company Odata data to construct a proportion of late suppliers taking the CompanyName and LateDays fields.
I opened the file and converted it into a string, since I found a really helpful post on how to request URLs with authentication, and I obtained my string text containing the whole report. The report is written in XML and I am using Python 3.7 in Jupyter Notebook to handle it.
I found another post that queries a XML file similar to mine using a class method, but my output is the error ValueError: stat: path too long for Windows.
How can I fix this?
Thanks!
import requests
import pandas as pd
import numpy as np
import base64
import urllib.request
# Fetch the OData report over HTTP using a Basic-auth header.
request = urllib.request.Request('https://myUrl_OData')
base64string = base64.b64encode(bytes('%s:%s' % ('Myusername', 'Mypassword'),'ascii'))
request.add_header("Authorization", "Basic %s" % base64string.decode('utf-8'))
# The context manager closes the HTTP response once the body has been read.
with urllib.request.urlopen(request) as result:
    resulttext = result.read()
# errors='ignore' drops any bytes that are not valid UTF-8 instead of raising.
text = resulttext.decode(encoding='utf-8',errors='ignore')
from xml.sax import parse
from xml.sax.handler import ContentHandler
# SAX content handler intended to track CompanyName / LateDays elements and
# the character data found inside them.
class properties(ContentHandler):
def __init__(self):
self.elements = [] # stack of elements
self.char_data = u'' # string buffer
self.current_vendor = u''
self.current_latedays = u''
def startElement(self, name, attrs):
# NOTE(review): `companyname` and `latedays` are not defined anywhere in this
# snippet; the element name arrives in the `name` parameter.
if companyname == u'CompanyName':
self.elements.append(u'CompanyName')
if latedays == u'LateDays':
self.elements.append(u'LateDays')
def characters(self, chars):
# Accumulate text only while the top of the stack is one of the tracked elements.
if len(self.elements) > 0 and self.elements[-1] in [u'CompanyName', u'LateDays']:
self.char_data += chars
def endElement(self, name):
# Pop the innermost element if the stack is non-empty.
self.elements.pop() if len(self.elements) > 0 else None
if companyname == u'CompanyName':
self.current_vendor = self.char_data
self.char_data = ''
if latedays == u'LateDays':
self.current_latedays = self.char_data
self.char_data = ''
if companyname == 'CompanyName':
# NOTE(review): compares the collected text with the tag name 'LateDays' - confirm intent.
if self.current_latedays == u'LateDays':
# NOTE(review): `current_customer` is not set in __init__ (only `current_vendor` is).
print('Found:', self.current_customer)
# clear the buffers now that is finished
self.current_year = u''
self.current_customer = u''
self.char_data = u''
# NOTE(review): xml.sax.parse takes a filename or stream as its first argument,
# while `text` holds the downloaded document itself; xml.sax.parseString accepts
# the document directly.
parse(r"\\\\?\\" + text, properties())
Your error doesn't seem to be related to XML parsing but to your OS limitations.
On a Windows-based OS, the path of a file cannot be longer than ~260 characters (ref).
Try to reduce the length of your filename, or reduce the number of nested folders leading to your data.

Copy parameters into list

I am trying to copy parameters passed into a Python script to a file. Here are the parameters:
["0013","1","1","\"john.dow#gmail.com\"","1","P123-ND 10Q","10Q H??C"]
I understand that there is a buffer problem and I am getting bad data into my parameters. However, I do not have control over what is being passed in. I am trying to copy, starting at the 5th parameter, the parameters into a file.
# Write every argument from index 5 onward to the file, one per line.
with open(in_file_name, 'w') as f:
    for extra_arg in arg_list[5:]:
        f.write(extra_arg + '\n')
The result of the file is below:
P123-ND 10Q
10Q H??C
Here is what it should be:
P123-ND
10Q
How can I not include the bad data? What is happening to the spaces between the valid information and the bad information?
As requested, here is the full program:
#!/bin/python
class Argument_Indices:
    """Positions of the fixed leading entries in the argument list."""

    (PRINTER_INDEX,
     AREA_INDEX,
     LABEL_INDEX,
     EMAIL_INDEX,
     RUN_TYPE_INDEX) = range(5)
import argparse
import json
import os
from subprocess import call
import sys
from time import strftime
def _handle_args():
''' Setup and run argpars '''
parser = argparse.ArgumentParser(description='Set environment variables for and to call Program')
parser.add_argument('time_to_run', default='NOW', choices=['NOW', 'EOP'], help='when to run the report')
parser.add_argument('arguments', nargs='+', help='the remaining command line arguments')
return parser.parse_args()
def _process_program(arg_list):
    """Write the trailing arguments to a temp input file and run Program.bin.

    The printer, area and label entries of ``arg_list`` are passed on the
    command line. Every entry from index 5 onward is written, one per line,
    to a time-stamped file whose path is exported in the ``INPUT_FILE``
    environment variable. The file is removed after the call, even if the
    call raises.
    """
    time_stamp = strftime("%d_%b_%Y_%H_%M_%S")
    printer = arg_list[Argument_Indices.PRINTER_INDEX]
    area = arg_list[Argument_Indices.AREA_INDEX]
    label = arg_list[Argument_Indices.LABEL_INDEX]
    in_file_name = "/tmp/program{0}.inp".format(time_stamp)
    os.environ['INPUT_FILE'] = in_file_name
    with open(in_file_name, 'w') as f:
        for extra_arg in arg_list[5:]:
            # One argument per line, so consecutive values do not run together.
            f.write(extra_arg + '\n')
    try:
        call(['./Program.bin', printer, area, label])
    finally:
        os.remove(in_file_name)


# main() calls `_process_program`; keep the original misspelled name as an alias.
_proces_program = _process_program
def main():
    """Load the JSON argument list named on the command line and process it.

    Returns 0 on success and -1 when no input file name was supplied.
    """
    args = _handle_args()
    if not args.arguments:
        print('Missing name of input file')
        return -1
    input_path = args.arguments[0]
    with open(input_path) as input_file:
        arg_list = json.load(input_file)
    _process_program(arg_list)
    return 0


if __name__ == '__main__':
    status = main()
    if status != 0:
        print('Program run failed')
    sys.exit()
For your exact case (where you're getting duplicated parameters received with some spaces in between) this would work:
received_param_list = ["0013","1","1","\"john.dow#gmail.com\"","1","P123-ND 10Q","10Q H??C"]
# Each received value is the real parameter followed by spaces and stray
# data, so keep only the text before the first space. Nothing after that
# space is re-added: for the last entry it is the corrupt "H??C".
arg_list = [param.split(" ")[0] for param in received_param_list]
for arg in arg_list[5:]:
    print(arg)
Although there might be another simpler way

Grab Bing Wallpaper with python3?

I wanna write a python script that grabs the bing.com wallpaper and saves it.
The urls of these wallpapers look like:
http://www.bing.com/az/hprichbg/rb/EuropeESA_DE-DE7849418832_1920x1080.jpg
http://www.bing.com/az/hprichbg/rb/CanisLupus_DE-DE11366975292_1920x1080.jpg
http://www.bing.com/az/hprichbg/rb/HouseBoats_DE-DE8695714746_1920x1080.jpg
Is there a way to find the image url of todays wallpaper automatically?
Based on a few of the useful answers in this related SO question, here's a simple Python script to fetch the Bing photo of the day:
import requests
import json
BING_URI_BASE = "http://www.bing.com"
BING_WALLPAPER_PATH = "/HPImageArchive.aspx?format=js&idx=0&n=1&mkt=en-US"

# open the Bing HPImageArchive URI and ask for a JSON response
resp = requests.get(BING_URI_BASE + BING_WALLPAPER_PATH)
if resp.status_code != 200:
    raise ValueError("[ERROR] non-200 response from Bing server for '{}'".format(BING_URI_BASE + BING_WALLPAPER_PATH))

json_response = json.loads(resp.content)
wallpaper_path = json_response['images'][0]['url']
filename = wallpaper_path.split('/')[-1]
wallpaper_uri = BING_URI_BASE + wallpaper_path

# open the actual wallpaper uri, and write the response as an image on the filesystem
response = requests.get(wallpaper_uri)
# Check the image download itself (`response`), not the earlier metadata
# request (`resp`), so a failed download is never written to disk.
if response.status_code != 200:
    raise ValueError("[ERROR] non-200 response from Bing server for '{}'".format(wallpaper_uri))

with open(filename, 'wb') as f:
    f.write(response.content)
This will write a file such as TurtleTears_EN-US7942276596_1920x1080.jpg to the same directory where the script is executed. Of course, you can tweak a whole bunch of things here, but this gets the job done reasonably easily.
Grab it and save it in a folder by using this code:
import datetime
from urllib.request import urlopen, urlretrieve
from xml.dom import minidom
import os
import sys
def join_path(*args):
    """Join path components with '/' and return the normalised path.

    Accepts either several components or a single list of components.
    Each component is converted with ``str`` and stripped of surrounding
    spaces. Calling it with no components returns ``os.path.normpath('')``
    (the current directory) instead of raising IndexError.
    """
    if args and isinstance(args[0], list):
        path_list = args[0]
    else:
        path_list = args
    val = [str(v).strip(' ') for v in path_list]
    return os.path.normpath('/'.join(val))
# Images are stored in an "images" folder next to this script.
dir_path = os.path.dirname(os.path.realpath(__file__))
save_dir = join_path(dir_path, 'images')
# exist_ok avoids the race between checking for the folder and creating it.
os.makedirs(save_dir, exist_ok=True)
def set_wallpaper(pic_path):
    """Set the desktop wallpaper to the image at ``pic_path``.

    On Windows the registry value is written with ``REG ADD`` and the user
    parameters are refreshed through ``rundll32.exe``; on Linux ``gsettings``
    is used. Any other platform prints 'OS not supported.'.

    Commands are run as argument lists rather than shell strings, so a path
    containing spaces or shell metacharacters is passed through unchanged.
    """
    import subprocess  # local import keeps this function self-contained

    if sys.platform.startswith('win32'):
        commands = [
            ['REG', 'ADD', r'HKCU\Control Panel\Desktop', '/v', 'Wallpaper',
             '/t', 'REG_SZ', '/d', pic_path, '/f'],
            # Split so that the joined command line is the same text as before.
            ['rundll32.exe', 'user32.dll,', 'UpdatePerUserSystemParameters'],
        ]
    elif sys.platform.startswith('linux'):
        commands = [
            ['gsettings', 'set', 'org.gnome.desktop.background',
             'picture-uri', 'file://' + pic_path],
        ]
    else:
        print('OS not supported.')
        return

    for command in commands:
        try:
            subprocess.call(command)
        except OSError as e:
            # The tool is missing or cannot be started: report it, do not crash.
            print('Could not run', command[0], e)
            return
    print('Wallpaper is set.')
    return
def download_old_wallpapers(minus_days=False):
    """Download past wallpapers without applying them to the desktop.

    With a truthy ``minus_days`` only that index is fetched through
    ``download_wallpaper(idx=minus_days, use_wallpaper=False)``; otherwise
    indexes 0 through 19 are fetched one after another.
    """
    if not minus_days:
        for day_index in range(0, 20):  # max 20
            download_wallpaper(idx=day_index, use_wallpaper=False)
        return
    download_wallpaper(idx=minus_days, use_wallpaper=False)
def download_wallpaper(idx=0, use_wallpaper=True):
    """Download the Bing wallpaper for archive index ``idx`` into ``save_dir``.

    ``idx`` is both the archive index requested from Bing and the number of
    days subtracted from today to build the file name. If the file already
    exists nothing is downloaded. When ``use_wallpaper`` is true the image
    is also applied through ``set_wallpaper``. Download and parse errors are
    printed and the function returns without raising.
    """
    # Getting the XML File
    archive_url = ''.join(['http://www.bing.com/HPImageArchive.aspx?format=xml&idx=',
                           str(idx), '&n=1&mkt=ru-RU'])  # ru-RU, because they always have 1920x1200 resolution
    try:
        usock = urlopen(archive_url)
    except Exception as e:
        print('Error while downloading #', idx, e)
        return
    try:
        # Close the HTTP response as soon as the document has been parsed.
        with usock:
            xmldoc = minidom.parse(usock)
    except Exception as e:
        print('Error while processing XML index #', idx, e)
        return
    # Parsing the XML File
    for element in xmldoc.getElementsByTagName('url'):
        url = 'http://www.bing.com' + element.firstChild.nodeValue
        # Get Current Date as fileName for the downloaded Picture
        now = datetime.datetime.now()
        date = now - datetime.timedelta(days=int(idx))
        pic_path = join_path(save_dir, ''.join([date.strftime('bing_wp_%d-%m-%Y'), '.jpg']))
        if os.path.isfile(pic_path):
            print('Image of', date.strftime('%d-%m-%Y'), 'already downloaded.')
            if use_wallpaper:
                set_wallpaper(pic_path)
            return
        print('Downloading: ', date.strftime('%d-%m-%Y'), 'index #', idx)
        # Download and Save the Picture
        # Get a higher resolution by replacing the file name
        urlretrieve(url.replace('_1366x768', '_1920x1200'), pic_path)
        # Set Wallpaper if wanted by user
        if use_wallpaper:
            set_wallpaper(pic_path)


if __name__ == "__main__":
    download_wallpaper()
import urllib.request

# Save each URL as "Image N.jpg", numbering from 1.
# In Python 3 urlretrieve lives in urllib.request, not in urllib itself.
for number, url in enumerate(list_of_urls, start=1):
    urllib.request.urlretrieve(url, 'Image {}.jpg'.format(number))

split a pdf based on outline

i would like to use pyPdf to split a pdf file based on the outline where each destination in the outline refers to a different page within the pdf.
example outline:
main --> points to page 1
sect1 --> points to page 1
sect2 --> points to page 15
sect3 --> points to page 22
it is easy within pyPdf to iterate over each page of the document or each destination in the document's outline; however, i cannot figure out how to get the page number where the destination points.
does anybody know how to find the referencing page number for each destination in the outline?
I figured it out:
class Darrell(pyPdf.PdfFileReader):
    """PdfFileReader that can report which page each outline entry points to."""

    def getDestinationPageNumbers(self):
        """Return a dict mapping outline titles to zero-based page numbers.

        A title whose page object id is not found maps to '???'.
        """

        def _setup_outline_page_ids(outline, _result=None):
            # Walk the (possibly nested) outline and record, per destination,
            # the object id of the page it refers to.
            if _result is None:
                _result = {}
            for entry in outline:
                if isinstance(entry, pyPdf.pdf.Destination):
                    _result[(id(entry), entry.title)] = entry.page.idnum
                elif isinstance(entry, list):
                    _setup_outline_page_ids(entry, _result)
            return _result

        def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
            # Walk the page tree depth-first; _num_pages grows by one entry
            # per leaf page, so its length is the index of the next page.
            if _result is None:
                _result = {}
            if pages is None:
                _num_pages = []
                pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
            node_type = pages["/Type"]
            if node_type == "/Pages":
                for kid in pages["/Kids"]:
                    _result[kid.idnum] = len(_num_pages)
                    _setup_page_id_to_num(kid.getObject(), _result, _num_pages)
            elif node_type == "/Page":
                _num_pages.append(1)
            return _result

        outline_page_ids = _setup_outline_page_ids(self.getOutlines())
        page_id_to_page_numbers = _setup_page_id_to_num()

        return dict(
            (title, page_id_to_page_numbers.get(page_idnum, '???'))
            for (_, title), page_idnum in outline_page_ids.iteritems()
        )
pdf = Darrell(open(PATH-TO-PDF, 'rb'))
template = '%-5s %s'
print template % ('page', 'title')
for p,t in sorted([(v,k) for k,v in pdf.getDestinationPageNumbers().iteritems()]):
print template % (p+1,t)
This is just what I was looking for. Darrell's additions to PdfFileReader should be part of PyPDF2.
I wrote a little recipe that uses PyPDF2 and sejda-console to split a PDF by bookmarks. In my case there are several Level 1 sections that I want to keep together. This script allows me to do that and give the resulting files meaningful names.
import operator
import os
import subprocess
import sys
import time
import PyPDF2 as pyPdf
# need to have sejda-console installed
# change this to point to your installation
sejda = 'C:\\sejda-console-1.0.0.M2\\bin\\sejda-console.bat'
class Darrell(pyPdf.PdfFileReader):
...
if __name__ == '__main__':
t0= time.time()
# get the name of the file to split as a command line arg
pdfname = sys.argv[1]
# open up the pdf
pdf = Darrell(open(pdfname, 'rb'))
# build list of (pagenumbers, newFileNames)
splitlist = [(1,'FrontMatter')] # Customize name of first section
template = '%-5s %s'
print template % ('Page', 'Title')
print '-'*72
for t,p in sorted(pdf.getDestinationPageNumbers().iteritems(),
key=operator.itemgetter(1)):
# Customize this to get it to split where you want
if t.startswith('Chapter') or \
t.startswith('Preface') or \
t.startswith('References'):
print template % (p+1, t)
# this customizes how files are renamed
new = t.replace('Chapter ', 'Chapter')\
.replace(': ', '-')\
.replace(': ', '-')\
.replace(' ', '_')
splitlist.append((p+1, new))
# call sejda tools and split document
call = sejda
call += ' splitbypages'
call += ' -f "%s"'%pdfname
call += ' -o ./'
call += ' -n '
call += ' '.join([str(p) for p,t in splitlist[1:]])
print '\n', call
subprocess.call(call)
print '\nsejda-console has completed.\n\n'
# rename the split files
for p,t in splitlist:
old ='./%i_'%p + pdfname
new = './' + t + '.pdf'
print 'renaming "%s"\n to "%s"...'%(old, new),
try:
os.remove(new)
except OSError:
pass
try:
os.rename(old, new)
print' succeeded.\n'
except:
print' failed.\n'
print '\ndone. Spliting took %.2f seconds'%(time.time() - t0)
Small update to @darrell's class to be able to parse UTF-8 outlines, which I post as an answer because a comment would be hard to read.
Problem is in pyPdf.pdf.Destination.title which may be returned in two flavors:
pyPdf.generic.TextStringObject
pyPdf.generic.ByteStringObject
so that the output from the _setup_outline_page_ids() function also contains two different types for the title object, which fails with UnicodeDecodeError if an outline title contains anything other than ASCII.
I added this code to solve the problem:
# Encode text-string titles to UTF-8 so every title ends up as a byte string.
if isinstance(title, pyPdf.generic.TextStringObject):
    title = title.encode('utf-8')
Here is the whole class:
class PdfOutline(pyPdf.PdfFileReader):
    """PdfFileReader that can report which page each outline entry points to."""

    def getDestinationPageNumbers(self):
        """Return a dict mapping outline titles to zero-based page numbers.

        Titles that are ``TextStringObject`` instances are encoded to UTF-8
        before being used as keys. A title whose page object id is not found
        maps to '???'.
        """

        def _setup_outline_page_ids(outline, _result=None):
            # Walk the (possibly nested) outline and record, per destination,
            # the object id of the page it refers to.
            if _result is None:
                _result = {}
            for entry in outline:
                if isinstance(entry, pyPdf.pdf.Destination):
                    _result[(id(entry), entry.title)] = entry.page.idnum
                elif isinstance(entry, list):
                    _setup_outline_page_ids(entry, _result)
            return _result

        def _setup_page_id_to_num(pages=None, _result=None, _num_pages=None):
            # Walk the page tree depth-first; _num_pages grows by one entry
            # per leaf page, so its length is the index of the next page.
            if _result is None:
                _result = {}
            if pages is None:
                _num_pages = []
                pages = self.trailer["/Root"].getObject()["/Pages"].getObject()
            node_type = pages["/Type"]
            if node_type == "/Pages":
                for kid in pages["/Kids"]:
                    _result[kid.idnum] = len(_num_pages)
                    _setup_page_id_to_num(kid.getObject(), _result, _num_pages)
            elif node_type == "/Page":
                _num_pages.append(1)
            return _result

        outline_page_ids = _setup_outline_page_ids(self.getOutlines())
        page_id_to_page_numbers = _setup_page_id_to_num()

        result = {}
        for (_, title), page_idnum in outline_page_ids.iteritems():
            is_text_title = isinstance(title, pyPdf.generic.TextStringObject)
            key = title.encode('utf-8') if is_text_title else title
            result[key] = page_id_to_page_numbers.get(page_idnum, '???')
        return result
Darrell's class can be modified slightly to produce a multi-level table of contents for a pdf (in the manner of pdftoc in the pdftk toolkit.)
My modification adds one more parameter to _setup_page_id_to_num, an integer "level" which defaults to 1. Each invocation increments the level. Instead of storing just the page number in the result, we store the pair of page number and level. Appropriate modifications should be applied when using the returned result.
I am using this to implement the "PDF Hacks" browser-based page-at-a-time document viewer with a sidebar table of contents which reflects LaTeX section, subsection etc bookmarks. I am working on a shared system where pdftk can not be installed but where python is available.
A solution 10 years later for newer python and PyPDF:
from PyPDF2 import PdfReader, PdfWriter
filename = "main.pdf"
with open(filename, "rb") as f:
    r = PdfReader(f)
    # Nested outline levels appear as sub-lists inside r.outline; only the
    # top-level entries have a title, so the sub-lists are skipped.
    bookmarks = [(x.title, r.get_destination_page_number(x))
                 for x in r.outline if not isinstance(x, list)]
    print(bookmarks)
    for i, b in enumerate(bookmarks):
        begin = b[1]
        # A section runs up to the next bookmark, or to the end of the document.
        end = bookmarks[i+1][1] if i < len(bookmarks) - 1 else len(r.pages)
        # print(len(r.pages[begin:end]))
        name = b[0] + ".pdf"
        print(f"{name=}: {begin=}, {end=}")
        w = PdfWriter()
        for p in r.pages[begin:end]:
            w.add_page(p)
        # Use a separate name so the reader's file handle `f` is not shadowed.
        with open(name, "wb") as out:
            w.write(out)

Categories

Resources