I'm writing a simple script that:
Loads a big list of URLs
Get the content of each URL making concurrent HTTP requests using requests' async module
Parses the content of the page with lxml in order to check if a link is in the page
If the link is present on the page, saves some info about the page in a ZODB database
When I test the script with 4 or 5 URLs it works well; the only thing I get is the following message when the script ends:
Exception KeyError: KeyError(45989520,) in <module 'threading' from '/usr/lib/python2.7/threading.pyc'> ignored
But when I try to check about 24000 URLs it fails toward the end of the list (when there are about 400 URLs left to check) with the following error:
Traceback (most recent call last):
File "check.py", line 95, in <module>
File "/home/alex/code/.virtualenvs/linka/local/lib/python2.7/site-packages/requests/async.py", line 83, in map
File "/home/alex/code/.virtualenvs/linka/local/lib/python2.7/site-packages/gevent-1.0b2-py2.7-linux-x86_64.egg/gevent/greenlet.py", line 405, in joinall
ImportError: No module named queue
Exception KeyError: KeyError(45989520,) in <module 'threading' from '/usr/lib/python2.7/threading.pyc'> ignored
I tried both with the version of gevent available on pypi and downloading and installing the latest version (1.0b2) from gevent repository.
I cannot understand why this happened, and why it happened only when I check a bunch of URLs. Any suggestions?
Here is the entire script:
from requests import async, defaults
from lxml import html
from urlparse import urlsplit
from gevent import monkey
from BeautifulSoup import UnicodeDammit
from ZODB.FileStorage import FileStorage
from ZODB.DB import DB
import transaction
import persistent
import random
# Apply gevent's monkey patching before any storage, connection or request
# object is created, so everything built below uses the cooperative
# (patched) primitives. Patching is most reliable when done as early as
# possible -- ideally before the other imports at the top of this script.
monkey.patch_all()

# Open the ZODB file storage and fetch the root mapping of the database.
storage = FileStorage('Data.fs')
db = DB(storage)
connection = db.open()
root = connection.root()

# Defaults applied to the requests library: browser-like User-Agent header
# and the retry count.
defaults.defaults['base_headers']['User-Agent'] = "Mozilla/5.0 (Windows NT 5.1; rv:11.0) Gecko/20100101 Firefox/11.0"
defaults.defaults['max_retries'] = 10
def save_data(source, target, anchor):
    """Store one match in the ZODB root under ``source`` and commit.

    The stored value is a PersistentMapping with the keys ``target`` and
    ``anchor``. The transaction is committed immediately after the write.
    """
    record = {'target': target, 'anchor': anchor}
    root[source] = persistent.mapping.PersistentMapping(record)
    transaction.commit()
def decode_html(html_string):
    """Decode a raw HTML byte string to unicode using UnicodeDammit.

    Parameters:
        html_string: the undecoded page body.

    Returns:
        The decoded unicode text.

    Raises:
        UnicodeDecodeError: when UnicodeDammit could not produce a decoded
            document; the reason lists the encodings that were tried.
    """
    converted = UnicodeDammit(html_string, isHTML=True)
    if not converted.unicode:
        # UnicodeDecodeError takes exactly five arguments
        # (encoding, object, start, end, reason). Passing only a message and
        # a string makes the constructor itself fail with TypeError, and the
        # "%s" placeholder was never interpolated.
        reason = "Failed to detect encoding, tried [%s]" % (
            ', '.join(converted.triedEncodings))
        raise UnicodeDecodeError(
            "unknown", html_string, 0, len(html_string), reason)
    return converted.unicode
def find_link(html_doc, url, domain="example.org"):
    """Look for the first absolute link in a page that points at ``domain``.

    Parameters:
        html_doc: the undecoded page body; it is decoded with decode_html().
        url: the address the page was fetched from, echoed back in the result.
        domain: text that must appear in the link's network location.
            Defaults to "example.org", the value that used to be hard-coded.

    Returns:
        A tuple ``(url, link, anchor_text)`` for the first matching link, or
        False when no link on the page matches.
    """
    decoded = decode_html(html_doc)
    doc = html.document_fromstring(decoded.encode('utf-8'))
    for element, attribute, link, pos in doc.iterlinks():
        # Only absolute http(s) links carried by an href attribute count.
        if attribute != "href" or not link.startswith('http'):
            continue
        if domain in urlsplit(link).netloc:
            return (url, link, element.text_content().strip())
    # Reached only after every link has been inspected without a match.
    return False
def check(response):
    """Response hook: record a matching link and update the progress counter.

    For a 200 response the body is scanned with find_link(); a hit is stored
    through save_data(). The module-level ``todo`` counter is decremented and
    printed for every response, whatever its status code.
    """
    global todo
    if response.status_code == 200:
        match = find_link(response.content, response.url)
        if match:
            source, target, anchor = match
            save_data(source, target, anchor)
    todo -= 1
    print(todo)
def load_urls(fname):
    """Read URLs from ``fname`` (one per line) and return them shuffled.

    Surrounding whitespace is stripped, duplicates are removed and blank
    lines are skipped, so an empty string is never returned as a URL.

    Parameters:
        fname: path of the text file to read.

    Returns:
        A list of unique, non-empty URL strings in random order.
    """
    with open(fname) as fh:
        # Iterate the file lazily instead of materialising readlines().
        stripped = (line.strip() for line in fh)
        urls = list(set(url for url in stripped if url))
    random.shuffle(urls)
    return urls
if __name__ == "__main__":
urls = load_urls('urls.txt')
rs = []
todo = len(urls)
print "Ready to analyze %s pages" % len(urls)
for url in urls:
rs.append(async.get(url, hooks=dict(response=check), timeout=10.0))
responses = async.map(rs, size=100)
print "DONE."
I'm not sure what's the source of your problem, but why do you have monkey.patch_all() not at the top of the file?
Could you try putting
from gevent import monkey; monkey.patch_all()
at the top of your main program and see if it fixes anything?
I am quite new to this, but I can try to help!
I think you could try replacing your import list with this one:
from requests import async, defaults
import requests
from lxml import html
from urlparse import urlsplit
from gevent import monkey
import gevent
from BeautifulSoup import UnicodeDammit
from ZODB.FileStorage import FileStorage
from ZODB.DB import DB
import transaction
import persistent
import random
Try this and tell me if it works .. I guess that can solve your problem :)
good day.
I think this is a known open Python bug, issue 1596321:
http://bugs.python.org/issue1596321
Related
app.py file code:
import webbrowser
import time
#!/usr/bin/env python
try:
# For Python 3.0 and later
from urllib.request import urlopen
except ImportError:
# Fall back to Python 2's urllib2
from urllib2 import urlopen
import certifi
import json
def get_jsonparsed_data(url):
    """
    Receive the content of ``url``, parse it as JSON and return the object.

    The request is verified against certifi's CA bundle, and the response is
    always closed once its body has been read.

    Parameters
    ----------
    url : str

    Returns
    -------
    dict
    """
    # Local import keeps this block self-contained.
    import ssl

    # urlopen()'s ``cafile`` argument is deprecated and no longer accepted by
    # recent Python 3 releases; an SSL context carries the same CA bundle.
    context = ssl.create_default_context(cafile=certifi.where())
    response = urlopen(url, context=context)
    try:
        data = response.read().decode("utf-8")
    finally:
        response.close()
    return json.loads(data)
# NOTE(review): the API key is embedded in the URL; consider loading it from
# the environment instead of committing it with the source.
url = ("https://financialmodelingprep.com/api/v3/quote/AAPL,FB?apikey=d099f1f81bf9a62d0f16b90c3dc3f718")

# Fetch once and reuse the result instead of calling the API twice.
country = get_jsonparsed_data(url)
print(country)

# Only the keys of the first record are rendered. Every key gets its own
# <div>; previously each iteration overwrote html_content, so just the last
# key reached the file, and an empty payload left html_content undefined.
first = next(iter(country), None)
header = first.keys() if isinstance(first, dict) else []
html_content = "".join(f"<div> {head} </div>" for head in header)

with open("index.html", "w") as html_file:
    html_file.write(html_content)
print("Html file created successfully !!")

# Short pause before showing the generated page in the browser.
time.sleep(2)
webbrowser.open_new_tab("index.html")
passenger_wsgi.py file code:
import imp
import os
import sys
# Put the directory containing this file first on the import path.
sys.path.insert(0, os.path.dirname(__file__))
# Load app.py as a module object registered under the name 'wsgi'.
wsgi = imp.load_source('wsgi', 'app.py')
# Re-export the module's `application` attribute under the same name here.
# This raises AttributeError when app.py does not define `application`.
application = wsgi.application
Error:
Traceback (most recent call last):
File "/home/stockpee/staging/passenger_wsgi.py", line 9, in <module>
application = wsgi.application
AttributeError: module 'wsgi' has no attribute 'application'
Traceback (most recent call last):
File "/home/stockpee/staging/passenger_wsgi.py", line 9, in <module>
application = wsgi.application
AttributeError: module 'wsgi' has no attribute 'application'
Hi,
Everyone, I am new to Python. I have developed a basic application on my local machine, but after deploying it to an A2 Hosting server I get the above error when I open the application in a web browser.
Can anyone help me fix this issue? I would be very thankful.
I apologize in advance for my brief answer, but the error occurs because you have not defined a WSGI app.
You need to make sure that your main application file, app.py, defines your application callable.
def app(environ, start_response):
    """Minimal WSGI application that answers every request with plain text.

    Parameters:
        environ: the WSGI environment dictionary (not inspected here).
        start_response: the WSGI callable used to send status and headers.

    Returns:
        A list holding the response body as a single byte string.
    """
    body = b"whoo hooo. "
    headers = [
        ("Content-Type", "text/plain; charset=utf-8"),
        ("Content-Length", str(len(body))),
    ]
    # A WSGI callable must call start_response and return an iterable of
    # bytes; printing a message and returning None is not a valid response.
    start_response("200 OK", headers)
    return [body]


# passenger_wsgi.py looks the callable up as ``application``.
application = app
I hope this helps.
import json
from six.moves.urllib_parse import urlencode
from six.moves.urllib_request import urlopen
from django.core.management.base import CommandError
def call(method, data, post=False):
    """
    Calls `method` from the DISQUS API with data either in POST or GET.
    Returns deserialized JSON response.

    Parameters:
        method: name of the API method, appended to the base URL.
        data: mapping (or sequence of pairs) of request parameters.
        post: send the parameters as a POST body instead of a query string.

    Raises:
        CommandError: when the response's "succeeded" field is falsy.
    """
    url = "%s%s" % ("http://disqus.com/api/", method)
    query = urlencode(data)
    if post:
        # POST request: urlopen requires the body as bytes on Python 3.
        url += "/"
        body = query.encode("utf-8")
    else:
        # GET request: the body must be None. Any non-None value, even an
        # empty string, makes urlopen issue a POST instead.
        url += "?%s" % query
        body = None
    response = urlopen(url, body)
    try:
        res = json.load(response)
    finally:
        response.close()
    if not res["succeeded"]:
        # Report the encoded parameters for both request kinds.
        raise CommandError(
            "'%s' failed: %s\nData: %s" % (method, res["code"], query)
        )
    return res["message"]
Hovering over the import shows "(module) moves", and Pylance reports:
Import "six.moves.urllib_parse" could not be resolved from source (Pylance, reportMissingModuleSource)
I installed the six module
into the Python virtual environment.
six itself can be imported without problems;
the MissingModuleSource warning occurs only for imports from six.moves.
Why can't six.moves be imported?
try changing system interpreter path of python in your IDE and set it to the virtual environment you use, in which you have installed the module.
an example in vscode
I have data on a website (which unfortunately couldn't be fetched using InvokeHTTP). I am trying to fetch the data using an ExecuteScript processor. Here is what I have done in Python; the things I tried are left in comments.
#import http.client, urllib.request, urllib.parse, urllib.error, base64
import httplib
#import requests
import java.io
from org.apache.commons.io import IOUtils
from java.nio.charset import StandardCharsets
from org.apache.nifi.processor.io import OutputStreamCallback
import json
from datetime import datetime
from java.lang import Object
from jarray import array
from org.python.core.util import StringUtil
class WriteContentCallback(OutputStreamCallback):
    """Callback that writes a fixed piece of text into an output stream."""

    def __init__(self, content):
        # Text written out when process() is invoked.
        self.content_text = content

    def process(self, outputStream):
        """Convert the stored text to bytes and write it to ``outputStream``.

        Any failure is logged and then reported to the caller as ValueError.
        """
        try:
            payload = StringUtil.toBytes(self.content_text)
            outputStream.write(payload)
        except Exception as error:
            # The logger receives its arguments as a Java Object array.
            log.error('Error processing the .csv file ===> {}',
                      array([error], Object))
            raise ValueError
# Create the flow file before entering the try block so the error handler
# below always has a flow file to route to REL_FAILURE.
flowFile = session.create()
try:
    conn = httplib.HTTPSConnection(url_domain)
    try:
        conn.request("GET", url)
        response = conn.getresponse()
        data = response.read()
    finally:
        # Release the connection whether or not the request succeeded.
        conn.close()
    # Write the downloaded body into the flow file and route it onward.
    flowFile = session.write(flowFile, WriteContentCallback(data))
    session.transfer(flowFile, REL_SUCCESS)
except Exception as outermost_error:
    # The logger receives its arguments as a Java Object array.
    objArray = [outermost_error]
    javaArray = array(objArray, Object)
    log.error('Error processing the .csv file ===> {}', javaArray)
    session.transfer(flowFile, REL_FAILURE)
The issue seems to be that there are no errors as such, but the resulting file is empty (0 bytes) and its type is application/octet-stream. What can I do to get the output from the website using ExecuteScript?
Here is the nifi flow, the flow has been marked in yellow.
Let me know if you need any other details. the code seems to work as jython program.And also if i pass some random text in place of data,that works too.
import urllib request
import requests
goog_url = "https://query1.finance.yahoo.com/v7/finance/download/GOOG?period1=1501517722&period2=1504196122&interval=1d&events=history&crumb=bU42Yaj88Bt"
def download_stock_data(csv_url):
    """Download the CSV at ``csv_url`` and save it as ``goog.csv``.

    ``ur`` is expected to be bound to the module that provides ``urlopen``
    (for example ``urllib.request``).

    The body is decoded as UTF-8 and written line by line, each line ending
    in a newline.
    """
    response = ur.urlopen(csv_url)
    try:
        raw = response.read()
    finally:
        response.close()
    # Decode the bytes. str() on a bytes object yields its "b'...'" repr,
    # which leaked a "b'" prefix and a trailing quote into the saved file.
    lines = raw.decode("utf-8").splitlines()
    dest_url = r'goog.csv'
    # The with-block closes the file even if a write fails.
    with open(dest_url, "w") as fx:
        for line in lines:
            fx.write(line + "\n")
download_stock_data(goog_url)
I'm trying to import a CSV file from the internet with this code. But I continue, despite my best efforts, to get a syntax error that says that it cannot find the request module of the urllib import.
File "/Users/Micmaster/PycharmProjects/pythonProject/firstProject.py", line 1
import urllib request
^
SyntaxError: invalid syntax
I've tried many different variations "from urllib import request", "import urllib.request", "import urllib", "import urllib2.request" and even changing versions of my interpreter on pycharm. Any help would be appreciated, thanks!
The urllib.request module is in Python 3 library;
For Python 2 you'd use urllib2.
I will write about Python 2.
Why are you trying to import request from the urllib package when, in Python 2, this package doesn't have it?
On the next line you import requests, so you could simply use requests.
I don't know why you need urllib, but change import urllib request to import urllib as ur.
import urllib
from urllib import request
goog_url = "https://query1.finance.yahoo.com/v7/finance/download/GOOG?period1=1501517722&period2=1504196122&interval=1d&events=history&crumb=bU42Yaj88Bt"
def download_stock_data(csv_url):
    """Download the CSV at ``csv_url`` and save it as ``goog.csv``.

    The body is decoded as UTF-8 and written line by line, each line ending
    in a newline.
    """
    response = urllib.request.urlopen(csv_url)
    try:
        raw = response.read()
    finally:
        response.close()
    # Decode the bytes. str() on a bytes object yields its "b'...'" repr,
    # which leaked a "b'" prefix and a trailing quote into the saved file.
    lines = raw.decode("utf-8").splitlines()
    dest_url = r'goog.csv'
    # The with-block closes the file even if a write fails.
    with open(dest_url, "w") as fx:
        for line in lines:
            fx.write(line + "\n")
download_stock_data(goog_url)
Your code should look like this. However, it still fails with HTTP Error 401: Unauthorized, because the URL you are using is wrong.
Grinder is new for me and I am trying to figure out how to get rid of this error:
my test.py script:
import string
import random
from java.lang import String
from java.net import URLEncoder
from net.grinder.plugin.http import HTTPRequest
from net.grinder.common import GrinderException
# Shorthand for the logger's info method. The name `grinder` must already be
# bound when this line runs; otherwise it raises NameError.
log = grinder.logger.info
# Handle used by TestRunner.__call__ to flag a failure via setSuccess(0).
stat = grinder.statistics.forLastTest
# Host and path that TestRunner joins into the request URL.
SERVER = "http://www.google.com"
URI = "/"
class TestRunner:
    """Callable test: GET SERVER + URI and check the body for "SUCCESS"."""

    def __call__(self):
        """Issue one GET request and flag a failure when the marker is absent."""
        requestString = "%s%s" % (SERVER, URI)
        request = HTTPRequest()
        result = request.GET(requestString)
        # string.find() returns 0 for a match at the very start of the body,
        # so the old "< 1" comparison reported that case as a failure too.
        # A membership test only fails when the marker is really missing.
        if "SUCCESS" not in result.getText():
            stat.setSuccess(0)
I run
java net.grinder.Console
java net.grinder.Grinder
in my localhost.
after starting the test, this message keeps popping up:
aborting process - Jython exception, <type 'exceptions.NameError'>: name 'grinder' is not defined [initialising test script]
net.grinder.scriptengine.jython.JythonScriptExecutionException: <type 'exceptions.NameError'>: name 'grinder' is not defined
log = grinder.logger.info
File "./test.py", line 8, in <module>
It looks like I have to include some Grinder module for this "grinder.logger.info" but I just have no clue of what I should import... ...
Any hint?
Thanks in advance
you imported items from grinder, not grinder itself, try
import grinder.logger.info
import grinder.statistics.forLastTest
it might also be net.grinder.logger.info and net.grinder.statistics.forLastTest if this is the case then your code will need changing to go with the change from grinder to net.grinder
You have not imported grinder.
from net.grinder.script.Grinder import grinder
and now try again.