I am trying to pass the local_filename attribute that get_img_url sets on to the rest of ImgurDownload, but I am getting the error
"AttributeError: ImgurDownload instance has no attribute 'local_filename'"
Is it possible to make local_filename available in a wider scope so the other methods can access it?
import re, os, glob, sys
import requests
from bs4 import BeautifulSoup
import pdb
import pprint
# imgur url pattern
imgurUrlPattern = re.compile(r'(http://i.imgur.com/(.*))(\?.*)?')
class ImgurDownload():
    #local_filename = None
    def __init__(self, link_url, target_subreddit, submissionid):
        self.link_url = link_url
        self.target_subreddit = target_subreddit
        self.submissionid = submissionid

    def download_image(self):
        response = requests.get("{}".format(self.link_url))
        if response.status_code == 200:
            #pdb.set_trace()
            #-------------> 2. local_filename is what I want to get from get_img_url <------------
            print('Downloading %s...' % self.local_filename)
            with open(self.local_filename, 'wb') as fo:
                for chunk in response.iter_content(4096):
                    fo.write(chunk)

    def get_img_url(self):
        if "imgur.com/" not in self.link_url:
            pass # skip non-imgur submissions
        if len(glob.glob('reddit_%s_%s_*' % (self.target_subreddit, self.submissionid))) > 0:
            pass # we've already downloaded files for this reddit submission
        if 'http://i.imgur.com/' in self.link_url:
            # The URL is a direct link to the image.
            mo = imgurUrlPattern.search(self.link_url) # using regex here instead of BeautifulSoup because we are parsing a url, not html
            imgurFilename = mo.group(2)
            if '?' in imgurFilename:
                # The regex doesn't catch a "?" at the end of the filename, so we remove it here.
                self.imgurFilename = imgurFilename[:imgurFilename.find('?')]
            #--------------------> 1. I want this instance var to be passed on to ^ download_image method <------------------------
            self.local_filename = 'reddit_%s_%s_album_None_imgur_%s' % (self.target_subreddit, self.submissionid, imgurFilename)
            self.download_image()

    def print_self_dict(self):
        pprint.pprint(self.__dict__)
sample = "http://i.imgur.com/yhemck6.jpg"
x = ImgurDownload('http://i.imgur.com/C5uIWlD.jpg','brogress','2cgwwn')
print x.download_image()
#x.print_self_dict()
self.local_filename is only defined if (a) you call get_img_url first and (b) the link contains imgur.com. In your example, you haven't done (a).
When self.local_filename is accessed in the download_image() method, Python looks for a local_filename instance attribute. Because that attribute was never set before being accessed in download_image(), Python raises an AttributeError.
We have to ensure that this attribute exists before it is accessed. The best approach is to initialize it inside __init__(); then, by the time download_image() runs, self.local_filename is already defined on the instance.
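For example, a minimal sketch of that fix (only the changed parts are shown; get_img_url() still builds the real filename):
class ImgurDownload():
    def __init__(self, link_url, target_subreddit, submissionid):
        self.link_url = link_url
        self.target_subreddit = target_subreddit
        self.submissionid = submissionid
        self.local_filename = None  # now always defined

    def download_image(self):
        if self.local_filename is None:
            # get_img_url() fills in local_filename and then calls download_image() again
            self.get_img_url()
            return
        response = requests.get(self.link_url)
        # ... rest unchanged ...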
Related
This is code for a web crawler.
I'm a beginner learning Python, so I don't know how to solve this.
Something seems wrong with search().
# -*- coding:utf-8 -*-
import urllib,urllib2,re
class BDTB:
    def __init__(self, baseUrl, seeLZ):
        self.baseUrl = baseUrl
        self.seeLZ = '?see_lz' + str(seeLZ)

    def getPage(self, pageNum):
        try:
            url = self.baseUrl + self.seeLZ + '&pn=' + str(pageNum)
            request = urllib2.Request(url)
            response = urllib2.urlopen(request)
            #print response.read().decode('utf-8')
            return response
        except urllib2.URLError, e:
            if hasattr(e, 'reason'):
                print u'Failed to connect to Baidu Tieba; reason:', e.reason
            return None

    def getTitle(self):
        page = self.getPage(1)
        pattern = re.compile('<h3 class.*?px">(.*?)</h3>', re.S)
        result = re.search(pattern, page)
        if result:
            print result.group(1)
            return result.group(1).strip()
        else:
            return None
baseURL = 'http://tieba.baidu.com/p/4095047339'
bdtb = BDTB(baseURL,1)
bdtb.getTitle()
This will raise a TypeError: expected string or buffer because you are passing the object returned from urllib2.urlopen(request) to re.search() when it requires a str.
If you change the return value from:
return response # returns the response object
to one that returns the text contained in the response:
return response.read() # returns the text contained in the response
your script works and after executing it returns:
广告兼职及二手物品交易集中贴
Additionally, since you're working with Python 2.x, you might want to change your class from class BDTB: to class BDTB(object): in order to use new-style classes.
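Putting both suggestions together, a sketch of the corrected method (__init__ and getTitle can stay as they are):
class BDTB(object):
    def getPage(self, pageNum):
        try:
            url = self.baseUrl + self.seeLZ + '&pn=' + str(pageNum)
            response = urllib2.urlopen(urllib2.Request(url))
            return response.read()  # return the page text, not the response object
        except urllib2.URLError, e:
            if hasattr(e, 'reason'):
                print u'Failed to connect to Baidu Tieba; reason:', e.reason
            return None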
I'm writing some code to parse an XML file. I'm just wondering if someone could explain why this isn't working. If I put the link variable itself into urllib.urlopen(), it does not seem to reach that URL. However, when I put "http://gdata.youtube.com/feeds/api/standardfeeds/top_rated?max-results=50&time=today" directly inside urllib.urlopen(), it works. Does it need to be a string and not a variable, or is there a way around it?
import urllib
from bs4 import BeautifulSoup
class Uel(object):
    def __init__(self, link):
        self.content_data = []
        self.num_likes = []
        self.num_dislikes = []
        self.favoritecount = []
        self.view_count = []
        self.link = link
        self.web_obj = urllib.urlopen(link)
        self.file = open('youtubequery.txt', 'w+')
        self.file.write(str(self.web_obj))
        for i in self.web_obj:
            self.file.write(i)
        with open("youtubequery.txt", "r") as myfile:
            self.file_2 = myfile.read()
        self.soup = BeautifulSoup(self.file_2)
        for link in self.soup.find_all("content"):
            self.content_data.append(str(link.get("src")))
        for stat in self.soup.find_all("yt:statistics"):
            self.favoritecount.append(str(stat.get("favoritecount")))
        for views in self.soup.find_all("yt:statistics"):
            self.view_count.append(str(views.get("viewcount")))
        for numlikes in self.soup.find_all("yt:rating"):
            self.num_likes.append(str(numlikes.get("numlikes")))
        for numdislikes in self.soup.find_all("yt:rating"):
            self.num_dislikes.append(str(numdislikes.get("numdislikes")))

    def __str__(self):
        return str(self.content_data), str(self.num_likes), str(self.num_dislikes)
link = "http://gdata.youtube.com/feeds/api/standardfeeds/top_rated?max-results=50&time=5"
data = Uel(link)
print data.__str__()
In the code you've presented, you are using this url:
http://gdata.youtube.com/feeds/api/standardfeeds/top_rated?max-results=50&time=5
a request to which produces:
Invalid value for time parameter: 5
But, in the question itself, you've mentioned the following URL:
http://gdata.youtube.com/feeds/api/standardfeeds/top_rated?max-results=50&time=today
which has time=today. The code with this URL works for me.
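As an aside, building the query string with urllib.urlencode makes this kind of typo harder to introduce; a small sketch:
import urllib

# urlencode assembles the query string, so stray characters in
# hand-written URLs can't creep in.
params = urllib.urlencode({'max-results': 50, 'time': 'today'})
link = 'http://gdata.youtube.com/feeds/api/standardfeeds/top_rated?' + params
data = Uel(link)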
I've saved a document in the blobstore and am trying to retrieve it within a handler (which handles a task). I've read the App Engine documentation on how to use the blobstore, but am struggling to make it work for my case. I've tried the following within the handler, but cannot seem to get the object returned as the saved file (e.g. .pdf or .txt):
class SendDocuments(webapp2.RequestHandler):
    def post(self):
        document_key = self.request.get("document_key")
        document_key = Key(str(document_key))
        the_document = DocumentsModel.all().filter("__key__ =", document_key).get()
        file_data = blobstore.BlobInfo.get(str(the_document.blobstore_key)) # returns a BlobInfo object
        file_data.open() # returns a BlobReader object
        file_data.open().read() # returns a string
I've also tried
class ServeSavedDocument(blobstore_handlers.BlobstoreDownloadHandler):
    def get(self, blob_key):
        self.send_blob(blob_key, save_as=True)
        return

class SendDocuments(webapp2.RequestHandler):
    def post(self):
        document_key = self.request.get("document_key")
        document_key = Key(str(document_key))
        the_document = DocumentsModel.all().filter("__key__ =", document_key).get()
        grab_blob = ServeSavedDocument()
        file_data = grab_blob.get(self, str(the_document.blobstore_key))
But the call to ServeSavedDocument fails with
'NoneType' object has no attribute 'headers'
I've had a look at the Files API, but the only example that doesn't save the file simply seems to return the blob key, i.e.
blob_key = files.blobstore.get_blob_key(file_name)
What is the best way to grab a saved file in the blobstore from within a handler?
EDIT 1:
I'm trying to retrieve the txt or pdf file from the blobstore in a format/state that can be encoded as a file in a POST request, using the following code:
from google.appengine.api import urlfetch
from poster.encode import multipart_encode, MultipartParam

# assuming here that file_data is the file object
payload = {}
payload['user_id'] = '1234123412341234'
payload['test_file'] = MultipartParam('test_file', filename=file_data.filename,
                                      filetype=file_data.type,
                                      fileobj=file_data.file)
data, headers = multipart_encode(payload)
send_url = "http://127.0.0.0/"
t = urlfetch.fetch(url=send_url, payload="".join(data), method=urlfetch.POST, headers=headers)
Okay, after a lot of messing around I've found that this works! The key was simply to call open() on the BlobInfo object.
class SendDocuments(webapp2.RequestHandler):
    def post(self):
        document_key = self.request.get("document_key")
        document_key = Key(str(document_key))
        the_document = DocumentsModel.all().filter("__key__ =", document_key).get()
        file_data = blobstore.BlobInfo.get(str(the_document.blobstore_key))
        payload = {}
        payload['user_id'] = '1234123412341234'
        payload['test_file'] = MultipartParam('the_file', filename="something",
                                              filetype=file_data.content_type,
                                              fileobj=file_data.open())
Have a look at the BlobReader class; I think that's what you are looking for:
https://developers.google.com/appengine/docs/python/blobstore/blobreaderclass
It allows file-like read access to blobstore data:
# blob_key = ..
# Instantiate a BlobReader for a given Blobstore value.
blob_reader = blobstore.BlobReader(blob_key)
# Read the entire value into memory. This may take a while depending
# on the size of the value and the size of the read buffer, and is not
# recommended for large values.
value = blob_reader.read()
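For large blobs, a hedged sketch of reading in fixed-size chunks instead (process() here is just a placeholder for whatever you do with each chunk):
# Read the value 1 MB at a time rather than all at once.
blob_reader = blobstore.BlobReader(blob_key, buffer_size=1048576)
while True:
    chunk = blob_reader.read(1048576)
    if not chunk:
        break
    process(chunk)  # placeholder: handle each chunk here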
Try this: specify the file name and its type in send_blob():
from mimetypes import guess_type

def mime_type(filename):
    return guess_type(filename)[0]

class ServeSavedDocument(blobstore_handlers.BlobstoreDownloadHandler):
    def get(self):
        blob_key = self.request.get("blob_key")
        if blob_key:
            blob_info = blobstore.get(blob_key)
            if blob_info:
                save_as1 = blob_info.filename
                type1 = mime_type(blob_info.filename)
                self.send_blob(blob_info, content_type=type1, save_as=save_as1)
I have a script named rdf.py that generates RDF code. What I want to do is load that output directly into 4store. I have stored the entire generated RDF in a variable and want to pass that variable directly to 4store. Is that possible?
The code of rdf.py is below.
rdf_code contains the entire RDF code that is generated.
import rdflib
from rdflib.events import Dispatcher, Event
from rdflib.graph import ConjunctiveGraph as Graph
from rdflib import plugin
from rdflib.store import Store, NO_STORE, VALID_STORE
from rdflib.namespace import Namespace
from rdflib.term import Literal
from rdflib.term import URIRef
from tempfile import mkdtemp
from gstudio.models import *
from objectapp.models import *
from reversion.models import Version
from optparse import make_option
def get_nodetype(name):
    """
    Returns the model the id belongs to.
    """
    try:
        """
        ALGO: get object id, go to version model, return for the given id.
        """
        node = NID.objects.get(title=str(name))
        # Retrieve only the relevant tupleset for the versioned objects
        vrs = Version.objects.filter(type=0, object_id=node.id)
        # Returned value is a list, so slice it.
        vrs = vrs[0]
    except Error:
        return "The item was not found."
    return vrs.object._meta.module_name
def rdf_description(name, notation='xml'):
    """
    Function takes the title of a node and the rdf notation.
    """
    valid_formats = ["xml", "n3", "ntriples", "trix"]
    default_graph_uri = "http://gstudio.gnowledge.org/rdfstore"
    configString = "/var/tmp/rdfstore"
    # Get the IOMemory plugin.
    store = plugin.get('IOMemory', Store)('rdfstore')
    # Open the previously created store, or create it if it doesn't exist yet
    graph = Graph(store="IOMemory",
                  identifier=URIRef(default_graph_uri))
    path = mkdtemp()
    rt = graph.open(path, create=False)
    if rt == NO_STORE:
        # There is no underlying Sleepycat infrastructure, create it
        graph.open(path, create=True)
    else:
        assert rt == VALID_STORE, "The underlying store is corrupt"
    # Now we'll add some triples to the graph & commit the changes
    # rdflib = Namespace('http://sbox.gnowledge.org/gstudio/')
    graph.bind("gstudio", "http://gnowledge.org/")
    exclusion_fields = ["id", "rght", "node_ptr_id", "image", "lft", "_state", "_altnames_cache", "_tags_cache", "nid_ptr_id", "_mptt_cached_fields"]
    node_type = get_nodetype(name)
    if (node_type == 'gbobject'):
        node = Gbobject.objects.get(title=name)
    elif (node_type == 'objecttype'):
        node = Objecttype.objects.get(title=name)
    elif (node_type == 'metatype'):
        node = Metatype.objects.get(title=name)
    elif (node_type == 'attributetype'):
        node = Attributetype.objects.get(title=name)
    elif (node_type == 'relationtype'):
        node = Relationtype.objects.get(title=name)
    elif (node_type == 'attribute'):
        node = Attribute.objects.get(title=name)
    elif (node_type == 'complement'):
        node = Complement.objects.get(title=name)
    elif (node_type == 'union'):
        node = Union.objects.get(title=name)
    elif (node_type == 'intersection'):
        node = Intersection.objects.get(title=name)
    elif (node_type == 'expression'):
        node = Expression.objects.get(title=name)
    elif (node_type == 'processtype'):
        node = Processtype.objects.get(title=name)
    elif (node_type == 'systemtype'):
        node = Systemtype.objects.get(title=name)
    node_url = node.get_absolute_url()
    site_add = node.sites.all()
    a = site_add[0]
    host_name = a.name
    #host_name=name
    link = 'http://'
    # Concatenating the above variables gives the full url address.
    url_add = link + host_name + node_url
    rdflib = Namespace(url_add)
    # node=Objecttype.objects.get(title=name)
    node_dict = node.__dict__
    subject = str(node_dict['id'])
    for key in node_dict:
        if key not in exclusion_fields:
            predicate = str(key)
            pobject = str(node_dict[predicate])
            graph.add((rdflib[subject], rdflib[predicate], Literal(pobject)))
    rdf_code = graph.serialize(format=notation)
    # print out all the triples in the graph
    for subject, predicate, object in graph:
        print subject, predicate, object
    graph.commit()
    print rdf_code
    graph.close()
Can I directly pass rdf_code to 4store? If yes, then how?
The simplest way to do this is to transform that graph into ntriples and send it to http://yourhost:port/data/GRAPH_URI. If you do an HTTP POST, the triples will be appended to the existing graph represented by GRAPH_URI. If you do an HTTP PUT, the current graph will be replaced. If the graph does not exist, it will be created whether you POST or PUT.
Taking this function as an example:
import urllib, urllib2

def assert4s(data, epr, graph, contenttype, flush=False):
    try:
        params = urllib.urlencode({'graph': graph,
                                   'data': data,
                                   'mime-type': contenttype})
        opener = urllib2.build_opener(urllib2.HTTPHandler)
        request = urllib2.Request(epr, params)
        request.get_method = lambda: ('PUT' if flush else 'POST')
        url = opener.open(request)
        return url.read()
    except Exception, e:
        raise e
If you had the following data:
triples = """<a> <b> <c> .
<d> <e> <f> .
"""
You can do the following call:
assert4s(triples,
         "http://yourhost:port/data/",
         "http://some.org/graph/id",
         "application/x-turtle")
Edit
My previous answer assumed you were using the 4s-httpd server. You can start 4store's SPARQL server with the command 4s-httpd -p PORT kb_name. Once it is running, you can use the following services:
http://localhost:port/sparql/ to submit queries
http://localhost:port/data/ to PUT or POST data files.
http://localhost:port/update/ to submit SPARQL update queries.
The 4store SPARQLServer documentation is quite complete.
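As a quick illustration, a minimal sketch of submitting a query to a local instance (the port 8000 and knowledge-base name here are placeholders of mine):
import urllib, urllib2

# Assumes 4s-httpd was started with e.g. "4s-httpd -p 8000 kb_name".
query = 'SELECT * WHERE { ?s ?p ?o } LIMIT 10'
params = urllib.urlencode({'query': query})
response = urllib2.urlopen('http://localhost:8000/sparql/', params)
print response.read()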
I have a CSV with keywords in one column and the number of impressions in a second column.
I'd like to provide the keywords in a URL (while looping) and have the Google language API return what language each keyword is in.
I have it working manually. If I enter (with the correct api key):
http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&key=myapikey&q=merde
I get:
{"responseData": {"language":"fr","isReliable":false,"confidence":6.213709E-4}, "responseDetails": null, "responseStatus": 200}
which is correct, 'merde' is French.
So far I have this code, but I keep getting server-unreachable errors:
import time
import csv
from operator import itemgetter
import sys
import fileinput
import urllib2
import json
E_OPERATION_ERROR = 1
E_INVALID_PARAMS = 2
#not working
def parse_result(result):
    """Parse a JSONP result string and return a list of terms"""
    # Deserialize JSON to Python objects
    result_object = json.loads(result)
    # Get the rows in the table, then get the second column's value
    # for each row
    return row in result_object

#not working
def retrieve_terms(seedterm):
    print(seedterm)
    """Retrieves and parses data and returns a list of terms"""
    url_template = 'http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&key=myapikey&q=%(seed)s'
    url = url_template % {"seed": seedterm}
    try:
        with urllib2.urlopen(url) as data:
            data = perform_request(seedterm)
            result = data.read()
    except:
        sys.stderr.write('%s\n' % 'Could not request data from server')
        exit(E_OPERATION_ERROR)
    #terms = parse_result(result)
    #print terms
    print result

def main(argv):
    filename = argv[1]
    csvfile = open(filename, 'r')
    csvreader = csv.DictReader(csvfile)
    rows = []
    for row in csvreader:
        rows.append(row)
    sortedrows = sorted(rows, key=itemgetter('impressions'), reverse=True)
    keys = sortedrows[0].keys()
    for item in sortedrows:
        retrieve_terms(item['keywords'])
    try:
        outputfile = open('Output_%s.csv' % (filename), 'w')
    except IOError:
        print("The file is active in another program - close it first!")
        sys.exit()
    dict_writer = csv.DictWriter(outputfile, keys, lineterminator='\n')
    dict_writer.writer.writerow(keys)
    dict_writer.writerows(sortedrows)
    outputfile.close()
    print("File is Done!! Check your folder")

if __name__ == '__main__':
    start_time = time.clock()
    main(sys.argv)
    print("\n")
    print time.clock() - start_time, "seconds for script time"
Any idea how to finish the code so that it will work? Thank you!
Try adding referrer and userip as described in the docs:
An area to pay special attention to relates to correctly identifying yourself in your requests. Applications MUST always include a valid and accurate http referer header in their requests. In addition, we ask, but do not require, that each request contains a valid API Key. By providing a key, your application provides us with a secondary identification mechanism that is useful should we need to contact you in order to correct any problems. Read more about the usefulness of having an API key.

Developers are also encouraged to make use of the userip parameter (see below) to supply the IP address of the end-user on whose behalf you are making the API request. Doing so will help distinguish this legitimate server-side traffic from traffic which doesn't come from an end-user.
Here's an example based on the answer to the question "access to google with python":
#!/usr/bin/python
# -*- coding: utf-8 -*-
import json
import urllib, urllib2
from pprint import pprint
api_key, userip = None, None
query = {'q' : 'матрёшка'}
referrer = "https://stackoverflow.com/q/4309599/4279"
if userip:
    query.update(userip=userip)
if api_key:
    query.update(key=api_key)
url = 'http://ajax.googleapis.com/ajax/services/language/detect?v=1.0&%s' % (
    urllib.urlencode(query))
request = urllib2.Request(url, headers=dict(Referer=referrer))
json_data = json.load(urllib2.urlopen(request))
pprint(json_data['responseData'])
Output
{u'confidence': 0.070496580000000003, u'isReliable': False, u'language': u'ru'}
Another issue might be that seedterm is not properly quoted:
if isinstance(seedterm, unicode):
    value = seedterm
else:  # bytes
    value = seedterm.decode(put_encoding_here)
url = 'http://...q=%s' % urllib.quote_plus(value.encode('utf-8'))
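Putting the quoting fix and the Referer header together, a sketch of what retrieve_terms() could become (the referrer value and the assumption that the CSV is UTF-8 encoded are mine):
import json
import urllib, urllib2

def retrieve_terms(seedterm):
    # Decode to unicode first; this assumes the CSV is UTF-8 encoded.
    if isinstance(seedterm, unicode):
        value = seedterm
    else:
        value = seedterm.decode('utf-8')
    params = urllib.urlencode({'v': '1.0', 'q': value.encode('utf-8')})
    url = 'http://ajax.googleapis.com/ajax/services/language/detect?' + params
    # Identify the application; the referrer below is a placeholder.
    request = urllib2.Request(url, headers={'Referer': 'http://example.com/'})
    return json.load(urllib2.urlopen(request))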