Getting none from fields while parsing a pdf file - python

I am trying to parse a pdf file. I want to get all the values in a list or dictionary of the checkbox values. But I am getting this error.
"return OrderedDict((k, v.get('/V', '')) for k, v in fields.items())
AttributeError: 'NoneType' object has no attribute 'items'"
The code I am trying is this
from collections import OrderedDict
from PyPDF2 import PdfFileWriter, PdfFileReader
def _getFields(obj, tree=None, retval=None, fileobj=None):
fieldAttributes = {'/FT': 'Field Type', '/Parent': 'Parent', '/T': 'Field Name', '/TU': 'Alternate Field Name',
'/TM': 'Mapping Name', '/Ff': 'Field Flags', '/V': 'Value', '/DV': 'Default Value'}
if retval is None:
retval = OrderedDict()
catalog = obj.trailer["/Root"]
# get the AcroForm tree
if "/AcroForm" in catalog:
tree = catalog["/AcroForm"]
else:
return None
if tree is None:
return retval
obj._checkKids(tree, retval, fileobj)
for attr in fieldAttributes:
if attr in tree:
# Tree is a field
obj._buildField(tree, retval, fileobj, fieldAttributes)
break
if "/Fields" in tree:
fields = tree["/Fields"]
for f in fields:
field = f.getObject()
obj._buildField(field, retval, fileobj, fieldAttributes)
return retval
def get_form_fields(infile):
infile = PdfFileReader(open(infile, 'rb'))
fields = _getFields(infile)
return OrderedDict((k, v.get('/V', '')) for k, v in fields.items())
if __name__ == '__main__':
from pprint import pprint
pdf_file_name = 'Guild.pdf'
pprint(get_form_fields(pdf_file_name))

After tracing through your code, on the 10th line it seems that catalog stores the value {'/Metadata': IndirectObject(16, 0), '/Pages': IndirectObject(1, 0), '/Type': '/Catalog'}, meaning /AcroForm is not a key in the dictionary and your function returns None.

Your _getFields explicitly returns None from first if block.
So basically that's where you could get this error from.

Related

Creating Python Active Resource object from json file

So for testing purposes, I am trying to create a python ActiveResource object from a json file (I want the object to have attributes from the json file). More specifically I am using the ShopifyResource from (https://github.com/Shopify/shopify_python_api), which extends the ActiveResource object.
I looked through the source code and found some functions that I thought would be of some use:
(https://github.com/Shopify/shopify_python_api/blob/master/shopify/base.py)
from pyactiveresource.activeresource import ActiveResource
import shopify.mixins as mixins
class ShopifyResource(ActiveResource, mixins.Countable):
_format = formats.JSONFormat
def _load_attributes_from_response(self, response):
if response.body.strip():
self._update(self.__class__.format.decode(response.body))
where _update is from ActiveResource (https://github.com/Shopify/pyactiveresource/blob/master/pyactiveresource/activeresource.py)
def _update(self, attributes):
"""Update the object with the given attributes.
Args:
attributes: A dictionary of attributes.
Returns:
None
"""
if not isinstance(attributes, dict):
return
for key, value in six.iteritems(attributes):
if isinstance(value, dict):
klass = self._find_class_for(key)
attr = klass(value)
elif isinstance(value, list):
klass = None
attr = []
for child in value:
if isinstance(child, dict):
if klass is None:
klass = self._find_class_for_collection(key)
attr.append(klass(child))
else:
attr.append(child)
else:
attr = value
# Store the actual value in the attributes dictionary
self.attributes[key] = attr
So then I tried to do the following:
order = Order()
with open("file.json")) as json_file:
x = json.loads(json_file.read())
order._update(x)
Where Order extends ShopifyResource (which extends ActiveResource). If am not mistaken x should be a dictionary, which is an approriate parameter for the _update() function.
Yet I get the following output:
raceback (most recent call last):
File "/home/vineet/Documents/project/tests/test_sync.py", line 137, in testSaveOrder1
self.getOrder()
File "/home/vineet/Documents/project/tests/tests/test_sync.py", line 113, in getOrder
order._update(x)
File "/home/vineet/Documents/project/venv/lib/python3.6/site-packages/pyactiveresource/activeresource.py", line 962, in _update
attr.append(klass(child))
File "/home/vineet/Documents/project/venv/lib/python3.6/site-packages/shopify/base.py", line 126, in __init__
prefix_options, attributes = self.__class__._split_options(attributes)
File "/home/vineet/Documents/project/venv/lib/python3.6/site-packages/pyactiveresource/activeresource.py", line 466, in _split_options
if key in cls._prefix_parameters():
File "/home/vineet/Documents/project/venv/lib/python3.6/site-packages/pyactiveresource/activeresource.py", line 720, in _prefix_parameters
for match in template.pattern.finditer(path):
TypeError: cannot use a string pattern on a bytes-like object
I even tried the following:
order._update(order._format.decode(json_file.read()))
But that didn't work since 'str' object has no attribute 'decode'.
It seems You are worried if x has correct format. Print it, and check.
Btw: And use
x = json.load(json_file)
instead of
x = json.loads(json_file.read())

Django & Python AttributeError: 'Retailer' object has no attribute

In my test file:
class TestMakeSoup(TestCase):
fixtures = ['deals_test_data.json']
def test_string_is_valid(self):
s = Retailer.objects.get(pk=1)
with open('/home/danny/PycharmProjects/askarby/deals/tests/BestBuyTest.html', 'r') as myfile:
text = myfile.read().replace('\n', '')
self.assertTrue(s.make_soup(text))
In the file it's testing:
class retailer():
'''
Retail site, drawn from database queryset object
'''
def __init__(self,r_object):
'''
Initializes retailer variables
obj -> nonetype
Precondition: r_object.currency == 3
Precondition: r_object.name != None
'''
assert len(r_object.currency) == 3, "{} must be a three-letter string (eg 'USD').".format(r_object.currency)
assert r_object.name != None, "Name must exist."
assert r_object.deal_container_css != None, "Title css must exist."
assert r_object.title_css != None, "Title css must exist."
assert r_object.price_css != None, "Price css must exist."
self.name = r_object.name
self.base_url = r_object.base_url
self.currency = r_object.currency
#dict containing css lookup values for various fields
self.css = {}
self.css['container'] = self.extract_css(r_object.deal_container_css)
self.css['title'] = self.extract_css(r_object.title_css)
self.css['product_model'] = self.extract_css(r_object.product_model_css)
self.css['price'] = self.extract_css(r_object.price_css)
self.css['old_price'] = self.extract_css(r_object.old_price_css)
self.css['brand'] = self.extract_css(r_object.brand_css)
self.css['image'] = self.extract_css(r_object.image_css)
self.css['description'] = self.extract_css(r_object.description_css)
self.css['exclude'] = self.extract_css(r_object.exclude_css)
self.css['shipping'] = self.extract_css(r_object.shipping_css)
#dict containing associated clearance urls for retailer
self.clearance_urls = self.get_clearance_urls()
#dict to house final list of deals
self.deals = {}
def __str__(self):
return self.name
def make_soup(self, text):
assert isinstance(text,str), "text must be string."
soup = bs4.BeautifulSoup(text, "html.parser")
if soup:
return soup
return False
The Retailer call refers to the Retailer model in my deals app.
I get this error:
Error
Traceback (most recent call last):
File "/home/danny/PycharmProjects/askarby/deals/tests/test_deals.py", line 101, in test_string_is_valid
self.assertTrue(s.make_soup(text))
AttributeError: 'Retailer' object has no attribute 'make_soup'
Why isn't make_soup running as a method?
The retailer class takes an object retrieved from the database. I retrieved the object from the database but didn't create a class with it.
def test_makesoup(self):
z = Retailer.objects.get(pk=1)
s = dealscan.retailer(z)
with open('/home/danny/PycharmProjects/askarby/deals/tests/BestBuyTest.html', 'r') as myfile:
text = myfile.read().replace('\n', '')
self.assertTrue(s.make_soup(text))
solves it.

skipping a json key if does not exist

I'm running the following:
for server in server_list:
for item in required_fields:
print item, eval(item)
There is a possibility that some keys may not exist, but worse it's represented on a parent key not the one I'm scanning for.
So I'm scanning the json for the following key:
server['server_management']['server_total_cost_of_ownership']['description']
Which doesn't exist but it's actually the parent that is null:
server['server_management']['server_total_cost_of_ownership']
How do I write my code to account for this? It's not giving a key error. Right now I get the following traceback:
Traceback (most recent call last):
File "C:/projects/blah/scripts/test.py", line 29, in <module>
print item, eval(item)
File "<string>", line 1, in <module>
TypeError: 'NoneType' object has no attribute '__getitem__'
Full code:
import csv
import json
import os
import requests
import sys
required_fields = ["server['server_name']","server['server_info']['asset_type']['display_name']",
"server['asset_status']['display_name']", "server['record_owner']['group_name']",
"server['server_management']['server_total_cost_of_ownership']['description']",
"server['server_management']['primary_business_owner']['name']",
"server['environment']['display_name']", "server['is_virtual']",
"server['managed_by']['display_name']", "server['server_info']['billable_ibm']",
"server['server_info']['billing_sub_type']['display_name']",
"server['server_info']['serial_number']", "server['location']['display_name']",
"server['inception_date']", "server['server_info']['decommission_date']" ]
# Query API for all servers
def get_servers_info():
servers_info = requests.get('url')
return servers_info.json()
def get_server_info(sid):
server_info = requests.get('url')
return server_info.json()
server_list = get_servers_info()
for server in server_list:
for item in required_fields:
print item, eval(item)
In fact you should avoid eval. After the json load since you know the key name, you can use a list to go deeper in the tree.
server['server_management']['primary_business_owner']['name']" => ["server_management', 'primary_business_owner', 'name']
Here a snippet for a json validation against a list of required fields.
data={
"d": {
"p":{
"r":[
"test"
]
}
},
"a": 3
}
def _get_attr(dict_, attrs):
try:
src = attrs[:]
root = attrs.pop(0)
node = dict_[root]
null = object()
for i, attr in enumerate(attrs[:]):
try:
node = node.get(attr, null)
except AttributeError:
node = null
if node is null:
# i+2 pop and last element
raise ValueError("%s not present (level %s)" % (attr, '->'.join(src[: i+2])))
return node
except KeyError:
raise ValueError("%s not present" % root)
# assume list of required field
reqs = [
["d", "p", "r"],
["d"],
["k"],
["d", "p", "r", "e"],
]
for req in reqs:
try:
_get_attr(data, req)
except ValueError as E:
print(E)
# prints
# k not present
# e not present (level d->p->r->e)
Ignoring the context of the code and not understanding the use of eval here, the way to do this is to use .get() and seed it with reasonable defaults.
For example:
server['server_management']['server_total_cost_of_ownership']['description']
Can be:
server.get('server_management', {}).get('server_total_cost_of_ownership', {}).get('description', '')
Then if any of the keys do not exist you will always get back an empty description ''.
Your problem here is totally unrelated to using eval[1]. The exception you get is the same as if the code would have been there directly. What you are running (via eval) is:
a = server['server_management']
b = a['server_total_cost_of_ownership']
c = b['description']
Yet, b is None, so resolving it to c will fail. Like a KeyError, you can also catch a TypeError:
for server in server_list:
for item in required_fields:
try:
print item, eval(item)
except TypeError:
print("Guess you're lucky you didn't include a fork bomb in your own code to eval.")
You may of course alternatively pass, print the offending item, open a browser to some page or do whatever error handling is appropriate given your input data.
[1] While not bickering around, I've made a new answer that works without eval. You can use precisely the same error handling:
for server in server_list:
for item in required_fields:
value = server
for key in parse_fields(field):
try:
value = value[key]
except TypeError:
print("Remember Kiddo: Eval is Evil!")
break
else: # for: else: triggers only if no break was issued
print item, value

error when parsing yahoo weather xml in python

I have been searching for a solution, so i am trying to get weather information for a location id in my database,it is meant to use location_id from model and store the information in weather log a model in my database this is the code and the error below:
![# -*- coding: UTF-8 -*-
import urllib
from xml.dom import minidom
from xml.dom.minidom import parse
from argparse import ArgumentParser
from pprint import pprint
from datetime import datetime
from django.db import models
from django.core.management.base import NoArgsCommand
Location = models.get_model("weatherapp", "Location")
WeatherLog = models.get_model("weatherapp", "WeatherLog")
SILENT, NORMAL, VERBOSE = 0, 1, 2
WEATHER_URL = 'http://weather.yahooapis.com/forecastrss?p=%s&u=c'
METRIC_PARAMETER = ''
WEATHER_NS = "http://xml.weather.yahoo.com/ns/rss/1.0"
def weather_for_location(location_id, options):
# taken from http://developer.yahoo.com/python/python-xml.html
# and modified a little
url = WEATHER_URL % location_id
try:
dom = minidom.parse(urllib.urlopen(url))
except Exception:
return None
# Get the units of the current feed.
yunits = dom.getElementsByTagNameNS(WEATHER_NS, 'units') \[0\]
# Get the location of the specified location code.
ylocation = dom.getElementsByTagNameNS(WEATHER_NS, 'location') \[0\]
# Get the current conditions.
ycondition = dom.getElementsByTagNameNS(WEATHER_NS, 'condition') \[0\]
forecasts = \[\]
for node in enumerate( dom.getElementsByTagNameNS(WEATHER_NS, 'forecast')):
forecasts.append({
'date': node.getAttribute('date'),
'low': node.getAttribute('low'),
'high': node.getAttribute('high'),
'condition': node.getAttribute('text')
})
return {
'current_condition': ycondition.getAttribute('text'),
'current_temp': ycondition.getAttribute('temp'),
'current_humidity': yatmosphere.getAttribute('humidity'),
'current_visibility': yatmosphere.getAttribute('visibility'),
'current_wind_speed': ywind.getAttribute('speed'),
'forecasts': forecasts,
'title': dom.getElementsByTagName('title')\[0\].firstChild.data,
'guid': dom.getElementsByTagName('guid')\[0\].firstChild.data,
}
class Command(NoArgsCommand):
help = "Aggregates data from weather feed"
def handle_noargs(self, **options):
verbosity = int(options.get('verbosity', NORMAL))
created_count = 0
for l in Location.objects.all():
weather = weather_for_location(l.location_id, options)
if verbosity > NORMAL:
pprint(weather)
timestamp_parts = map(int, weather\['guid'\].split("_")\[1:-1\])
timestamp = datetime(*timestamp_parts)
log, created = WeatherLog.objects.get_or_create(
location=l,
timestamp=timestamp,
defaults={
'temperature': weather\['current_temp'\],
'humidity': weather\['current_humidity'\],
'wind_speed': weather\['current_wind_speed'\],
'visibility': weather\['current_visibility'\],
}
)
if created:
created_count += 1
if verbosity > NORMAL:
print "New weather logs: %d" % created_count][1]
error:
> File
> "/home/temi/rapidsmstuts/myapp/weatherapp/management/commands/check_weather.py",
> line 74, in handle_noargs
> timestamp_parts = map(int, weather['guid'].split("_")[1:-1]) TypeError: 'NoneType' object has no attribute '__getitem__'
File "/home/temi/rapidsmstuts/myapp/weatherapp/management/commands/check_weather.py", line 47, in weather_for_location
'date': node.getAttribute('date'),
AttributeError: 'tuple' object has no attribute 'getAttribute'
File "/home/temi/rapidsmstuts/myapp/weatherapp/management/commands/check_weather.py", line 35, in weather_for_location
yunits = dom.getElementsByTagNameNS(WEATHER_NS, 'units') [0]
IndexError: list index out of range

How to point to value of variable

To avoid writing a long, ugly ternary a dozen times, such as p_count = root.pool_count if hasattr(root, 'pool_count') else (-1), I wrote the GetThis(el, attr, err="") function.
How do I get the function to concatenate the values of el & attr into an element, instead of attr as a literal?
test_data = ("""
<rsp stat="ok">
<group id="34427465497#N01" iconserver="1" iconfarm="1" lang="" ispoolmoderated="0" is_member="0" is_moderator="0" is_admin="0">
<name>GNEverybody</name>
<members>245</members>
<pool_count>133</pool_count>
<topic_count>106</topic_count>
<restrictions photos_ok="1" videos_ok="1" images_ok="1" screens_ok="1" art_ok="1" safe_ok="1" moderate_ok="0" restricted_ok="0" has_geo="0" />
</group>
</rsp>
""")
################
from lxml import etree
from lxml import objectify
################
def GetThis(el, attr, err=""):
if hasattr(el, attr):
return el.attr
else:
return err
################
Grp = objectify.fromstring(test_data)
root = Grp.group
gName = GetThis(root, "name", "No Name")
err_tst = GetThis(root, "not-there", "Error OK")
p_count = root.pool_count if hasattr(root, 'pool_count') else (-1)
As it is, I get the following error:
Traceback (most recent call last):
File "C:/Mirc/Python/Temp Files/test_lxml.py", line 108, in <module>
gName = GetThis(root, "name", "")
File "C:/Mirc/Python/Temp Files/test_lxml.py", line 10, in GetThis
print (el.attr)
File "lxml.objectify.pyx", line 218, in lxml.objectify.ObjectifiedElement.__getattr__ (src\lxml\lxml.objectify.c:3506)
File "lxml.objectify.pyx", line 437, in lxml.objectify._lookupChildOrRaise (src\lxml\lxml.objectify.c:5756)
AttributeError: no such child: attr
Thank you!
Your GetThis function is not needed since you can simply use getattr:
gName = getattr(root, "name", "No Name")
err_tst = getattr(root, "not-there", "Error OK")
The first argument is the object, the second is the attribute, and the third, which is optional, is a default value to return if the attribute doesn't exist (if this last part is omitted, an AttributeError is raised instead).
The two lines of code above are equivalent to these:
gName = root.name if hasattr(root, "name") else "No Name"
err_tst = root.not-there if hasattr(root, "not-there") else "Error OK"
Python has a function called getattr(foo,"bar") which return the value of the bar attr of the foo object. Just read the doc ..

Categories

Resources