I am running a Python script with PhantomJS and Selenium and I am facing a timeout issue: the script stops after 20-50 minutes. I need a solution so that I can run my script without this timeout issue. Where is the problem, and how can I solve it?
The input file cannot be read or is not in the proper format.
Traceback (most recent call last):
File "links_crawler.py", line 147, in <module>
crawler.Run()
File "links_crawler.py", line 71, in Run
self.checkForNextPages()
File "links_crawler.py", line 104, in checkForNextPages
self.next.click()
File "/home/dev/.local/lib/python2.7/site-packages/selenium/webdriver/remote/webelement.py", line 75, in click
self._execute(Command.CLICK_ELEMENT)
File "/home/dev/.local/lib/python2.7/site-packages/selenium/webdriver/remote/webelement.py", line 454, in _execute
return self._parent.execute(command, params)
File "/home/dev/.local/lib/python2.7/site-packages/selenium/webdriver/remote/webdriver.py", line 199, in execute
response = self.command_executor.execute(driver_command, params)
File "/home/dev/.local/lib/python2.7/site-packages/selenium/webdriver/remote/remote_connection.py", line 395, in execute
return self._request(command_info[0], url, body=data)
File "/home/dev/.local/lib/python2.7/site-packages/selenium/webdriver/remote/remote_connection.py", line 463, in _request
resp = opener.open(request, timeout=self._timeout)
File "/usr/lib/python2.7/urllib2.py", line 431, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 449, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 409, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1227, in http_open
return self.do_open(httplib.HTTPConnection, req)
File "/usr/lib/python2.7/urllib2.py", line 1200, in do_open
r = h.getresponse(buffering=True)
File "/usr/lib/python2.7/httplib.py", line 1127, in getresponse
response.begin()
File "/usr/lib/python2.7/httplib.py", line 453, in begin
version, status, reason = self._read_status()
File "/usr/lib/python2.7/httplib.py", line 417, in _read_status
raise BadStatusLine(line)
httplib.BadStatusLine: ''
Code:
import os
import re

from selenium import webdriver


class Crawler():
    def __init__(self, where_to_save, verbose=0):
        self.link_to_explore = ''
        self.TAG_RE = re.compile(r'<[^>]+>')
        # (?s) makes "." match newlines; the flag belongs at the start
        # of the pattern, not at the end.
        self.TAG_SCRIPT = re.compile(r'(?s)<(script).*?</\1>')
        if verbose == 1:
            self.driver = webdriver.Firefox()
        else:
            self.driver = webdriver.PhantomJS()
        self.links = []
        self.next = True
        self.where_to_save = where_to_save
        self.logs = self.where_to_save + "/logs"
        self.outputs = self.where_to_save + "/outputs"
        self.logfile = ''
        self.rnd = 0
        # Create the log/output directories if they do not exist yet.
        try:
            os.stat(self.logs)
        except OSError:
            os.makedirs(self.logs)
        try:
            os.stat(self.outputs)
        except OSError:
            os.makedirs(self.outputs)
try:
    fin = open(file_to_read, "r")
    FileContent = fin.read()
    fin.close()
    crawler = Crawler(where_to_save)
    data = FileContent.split("\n")
    for info in data:
        if info != "":
            to_process = info.split("|")
            link = to_process[0].strip()
            category = to_process[1].strip().replace(' ', '_')
            print "Processing the link: " + link
            crawler.Init(link, category)
            crawler.Run()
            crawler.End()
    crawler.closeSpider()
except:
    print "The input file cannot be read or is not in the proper format."
    raise
If you don't want a timeout to stop your script, you can catch the exception
selenium.common.exceptions.TimeoutException and pass it.
You can set the default page load timeout using the set_page_load_timeout() method of webdriver, like this:
driver.set_page_load_timeout(10)
This will throw a TimeoutException if your page doesn't load within 10 seconds.
EDIT:
Forgot to mention that you will have to put your code in a loop.
Add the import:
from selenium.common.exceptions import TimeoutException

while True:
    try:
        # Your code here
        break  # Loop will exit
    except TimeoutException:
        pass
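Putting the pieces together with the Crawler from the question, a minimal sketch might look like this (the 10-second limit and the retry-forever loop are illustrative choices, not requirements):

from selenium.common.exceptions import TimeoutException

crawler = Crawler(where_to_save)
crawler.driver.set_page_load_timeout(10)  # raise TimeoutException after 10 s

while True:
    try:
        crawler.Run()  # the long-running crawl from the question
        break          # finished without a timeout, leave the loop
    except TimeoutException:
        pass           # swallow the timeout and retry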
Related
I've made a very basic share price prediction script which essentially uses linear regression. It's not accurate, but it's a proof of concept for a uni project.
I want to add these prices to my website using the new PyScript library. However, when I run the code on my website, I get this error:
PythonError: Traceback (most recent call last):
  File "/lib/python3.10/asyncio/futures.py", line 201, in result
    raise self._exception
  File "/lib/python3.10/asyncio/tasks.py", line 232, in __step
    result = coro.send(None)
  File "/lib/python3.10/site-packages/_pyodide/_base.py", line 506, in eval_code_async
    await CodeRunner(
  File "/lib/python3.10/site-packages/_pyodide/_base.py", line 357, in run_async
    coroutine = eval(self.code, globals, locals)
  File "", line 6, in
  File "/lib/python3.10/site-packages/pandas/util/_decorators.py", line 311, in wrapper
    return func(*args, **kwargs)
  File "/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 680, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 575, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 933, in __init__
    self._engine = self._make_engine(f, self.engine)
  File "/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 1217, in _make_engine
    self.handles = get_handle(  # type: ignore[call-overload]
  File "/lib/python3.10/site-packages/pandas/io/common.py", line 670, in get_handle
    ioargs = _get_filepath_or_buffer(
  File "/lib/python3.10/site-packages/pandas/io/common.py", line 339, in _get_filepath_or_buffer
    with urlopen(req_info) as req:
  File "/lib/python3.10/site-packages/pandas/io/common.py", line 239, in urlopen
    return urllib.request.urlopen(*args, **kwargs)
  File "/lib/python3.10/urllib/request.py", line 216, in urlopen
    return opener.open(url, data, timeout)
  File "/lib/python3.10/urllib/request.py", line 519, in open
    response = self._open(req, data)
  File "/lib/python3.10/urllib/request.py", line 541, in _open
    return self._call_chain(self.handle_open, 'unknown',
  File "/lib/python3.10/urllib/request.py", line 496, in _call_chain
    result = func(*args)
  File "/lib/python3.10/urllib/request.py", line 1419, in unknown_open
    raise URLError('unknown url type: %s' % type)
urllib.error.URLError:
This solution worked absolutely fine in PyCharm, so I'm assuming it's something to do with PyScript?
I'll link the code below.
<py-script>
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
urlAAPL = "https://raw.githubusercontent.com/Dan168/-DanCreates/main/AAPL.csv"
dataAAPL = pd.read_csv(urlAAPL)
XAAPL = dataAAPL.iloc[:, 0].values.reshape(-1, 1)
YAAPL = dataAAPL.iloc[:, 1].values.reshape(-1, 1)
urlAMZN = "https://raw.githubusercontent.com/Dan168/-DanCreates/main/AMZN.csv"
dataAMZN = pd.read_csv(urlAMZN)
XAMZN = dataAMZN.iloc[:, 0].values.reshape(-1, 1)
YAMZN = dataAMZN.iloc[:, 1].values.reshape(-1, 1)
urlTSLA = "https://raw.githubusercontent.com/Dan168/-DanCreates/main/TSLA.csv"
dataTSLA = pd.read_csv(urlTSLA)
XTSLA = dataTSLA.iloc[:, 0].values.reshape(-1, 1)
YTSLA = dataTSLA.iloc[:, 1].values.reshape(-1, 1)
urlNIO = "https://raw.githubusercontent.com/Dan168/-DanCreates/main/NIO.csv"
dataNIO = pd.read_csv(urlNIO)
XNIO = dataNIO.iloc[:, 0].values.reshape(-1, 1)
YNIO = dataNIO.iloc[:, 1].values.reshape(-1, 1)
def Predict(X, Y, Name):
    lr = LinearRegression()
    lr.fit(X, Y)
    Y_pred = lr.predict(X)
    oneDay = 127
    oneWeek = 134
    oneMonth = 156
    print(str(Name))
    print("Price prediction one day from now: ")
    print(lr.predict(np.array([oneDay]).reshape(-1, 1)))
    print("Price prediction one week from now: ")
    print(lr.predict(np.array([oneWeek]).reshape(-1, 1)))
    print("Price prediction one month from now: ")
    print(lr.predict(np.array([oneMonth]).reshape(-1, 1)))
Predict(XAAPL, YAAPL, "AAPL")
Predict(XNIO, YNIO, "NIO")
Predict(XTSLA, YTSLA, "TSLA")
Predict(XAMZN, YAMZN, "AMZN")
</py-script>
I ran the code in PyCharm and it worked absolutely fine. It also ran fine on my website before, when it was just checking one CSV. When I added the method and more stocks to check is when I encountered the error, hence making me think it is something to do with the method?
Thanks in advance
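One plausible culprit (an assumption on my part, not something the question confirms): inside Pyodide-based environments such as PyScript there is no real socket/ssl support, so urllib cannot open https URLs and pandas.read_csv(url) fails with exactly this unknown-url-type error. A common workaround is to fetch the file through the browser with pyodide's open_url and hand the resulting buffer to pandas, sketched here for one of the CSVs from the question:

import pandas as pd
from pyodide.http import open_url

urlAAPL = "https://raw.githubusercontent.com/Dan168/-DanCreates/main/AAPL.csv"
# open_url performs the request via the browser and returns an
# in-memory text buffer that pandas can read directly.
dataAAPL = pd.read_csv(open_url(urlAAPL))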
So I have roughly 6600 image URLs that I want to read from a pickle file in order to download their images locally. There are two main problems I could not solve:
1) URLs that throw a UnicodeEncodeError. I tried many solutions available online and none worked, so I ended up skipping such URLs, but I would prefer a way around it. These are URLs that contain, for example, Chinese or Polish characters.
2) The other error is as follows (one example):
ssl.CertificateError: hostname 'www.straitstimes.com.sg' doesn't match either of 'gp1.adn.edgecastcdn.net', 'gs1.adn.edgecastcdn.net', 'ne1.adn.edgecastcdn.net', 'www.uship.com', 'www.bluefly.com', 'www.belleandclive.com', 'is.belleandclive.com', 'connections.cochlear.com', 'assets.pokemon.com', 'www.shopperschoice.com', 'www.biznessapps.com', 'cdn.shocho.co', 'secure.hibustudio.com', 'www.stardoll.com', 'adn.wiredrive.com', 'www.speedtest.net', 'www.cduniverse.com', 'ak-site-origin-cover.cduniverse.com', 'cover.cduniverse.com', 'g.cduniverse.com', 'www.renttherunway.com', 'cdn2.navexglobal.com', 'www.chdist.com', 'www.thefanorama.com', 'cdn2.mediasilo.com', 'cdn.citadoncw.com', 'www.woodcraft.com', 'marketing-admin.upsight-api.com', 'www.edgecast.com'
It happened when I needed to download the image from https://static.straitstimes.com.sg/sites/default/files/styles/x_large/public/articles/2017/09/23/ST_20170923_WONZ_3440048.jpg?itok=-iG-zSvr and, as you can see, the image is available.
I wonder how my code below could be modified to overcome these two issues, rather than wrapping everything in a try/except clause.
import pickle
import urllib.request
import re
import requests
from urllib.parse import quote
# img = urllib.request.urlopen(quote(value))
# img = urllib.urlopen(quote(value))
import time

count = 0
with open('images.pickle', 'rb') as handle:
    b = pickle.load(handle)
    print(len(b))
    for key, value in b.items():
        print(value)
        print(key, value)
        #value = iriToUri(value)
        if value != 'NA':
            count += 1
            print(value)
            try:
                img = urllib.request.urlopen(quote(value, "\./_-:"))
                #img = requests.get(value)
                split = urllib.parse.urlsplit(value)
                extension = split.path.split(".")[-1]
                print(extension)
                if extension.lower() == 'jpg':
                    filename = "immigration_images/" + str(key) + ".jpg"
                elif extension.lower() == 'jpeg':
                    filename = "immigration_images/" + str(key) + ".jpeg"
                elif extension.lower() == 'ico':
                    filename = "immigration_images/" + str(key) + ".ico"
                else:
                    filename = "immigration_images/" + str(key) + ".png"
                img_extension = img.info()['Content-Type']
                #print(img_extension)
                if img_extension:
                    if 'jpeg' in img_extension:
                        filename = "immigration_images/" + str(key) + ".jpeg"
                    elif 'jpg' in img_extension:
                        filename = "immigration_images/" + str(key) + '.jpg'
                    elif 'png' in img_extension:
                        filename = "immigration_images/" + str(key) + '.png'
                #urllib.request.urlretrieve(value, filename)
                urllib.request.urlretrieve(value, filename)
            except (urllib.error.ContentTooShortError, urllib.error.URLError, urllib.error.HTTPError, UnicodeEncodeError) as e:
                print(e)
                continue
print(count)
I am using Python 3.6.3 :: Anaconda custom (64-bit) on CentOS Linux release 7.4.1708 (Core).
Traceback:
Traceback (most recent call last):
File "/scratch2/news_bias/test_pickle.py", line 39, in <module>
img = urllib.request.urlopen(quote(value, "\./_-:"))
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 223, in urlopen
return opener.open(url, data, timeout)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 532, in open
response = meth(req, response)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 642, in http_response
'http', request, response, code, msg, hdrs)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 564, in error
result = self._call_chain(*args)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 756, in http_error_302
return self.parent.open(new, timeout=req.timeout)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 526, in open
response = self._open(req, data)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 544, in _open
'_open', req)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 504, in _call_chain
result = func(*args)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 1361, in https_open
context=self._context, check_hostname=self._check_hostname)
File "/scratch/sjn/anaconda/lib/python3.6/urllib/request.py", line 1318, in do_open
encode_chunked=req.has_header('Transfer-encoding'))
File "/scratch/sjn/anaconda/lib/python3.6/http/client.py", line 1239, in request
self._send_request(method, url, body, headers, encode_chunked)
File "/scratch/sjn/anaconda/lib/python3.6/http/client.py", line 1285, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "/scratch/sjn/anaconda/lib/python3.6/http/client.py", line 1234, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "/scratch/sjn/anaconda/lib/python3.6/http/client.py", line 1026, in _send_output
self.send(msg)
File "/scratch/sjn/anaconda/lib/python3.6/http/client.py", line 964, in send
self.connect()
File "/scratch/sjn/anaconda/lib/python3.6/http/client.py", line 1400, in connect
server_hostname=server_hostname)
File "/scratch/sjn/anaconda/lib/python3.6/ssl.py", line 407, in wrap_socket
_context=self, _session=session)
File "/scratch/sjn/anaconda/lib/python3.6/ssl.py", line 814, in __init__
self.do_handshake()
File "/scratch/sjn/anaconda/lib/python3.6/ssl.py", line 1068, in do_handshake
self._sslobj.do_handshake()
File "/scratch/sjn/anaconda/lib/python3.6/ssl.py", line 694, in do_handshake
match_hostname(self.getpeercert(), self.server_hostname)
File "/scratch/sjn/anaconda/lib/python3.6/ssl.py", line 327, in match_hostname
% (hostname, ', '.join(map(repr, dnsnames))))
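Not a full answer, but here is a sketch of two common workarounds, under the assumption that you are willing to skip certificate verification for the mismatched-CDN case (which disables a security check, so use it knowingly):

import ssl
import urllib.request
from urllib.parse import urlsplit, quote

def iri_to_uri(iri):
    # Percent-encode the path and query so non-ASCII characters
    # (e.g. Chinese or Polish) no longer raise UnicodeEncodeError.
    parts = urlsplit(iri)
    return parts._replace(path=quote(parts.path, safe="/%"),
                          query=quote(parts.query, safe="=&%")).geturl()

# A context that skips hostname/certificate checks, for servers such as
# static.straitstimes.com.sg that present a certificate for another host.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = iri_to_uri("https://static.straitstimes.com.sg/sites/default/files/styles/x_large/public/articles/2017/09/23/ST_20170923_WONZ_3440048.jpg?itok=-iG-zSvr")
with urllib.request.urlopen(url, context=ctx) as resp:
    data = resp.read()

Note that urlretrieve() does not accept a context argument, so with this approach you would write data to the target file yourself instead.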
I know this error has been discussed quite a bit, but it seems that there is a different cause in each case.
I am using the following code and selenium to extract some data from a website and get the error mentioned above during the second call of browser.get(url).
import openpyxl, os
from selenium import webdriver
os.chdir('C://Users/user/Documents')
os.makedirs('GenBank Data', exist_ok = True)
book = openpyxl.load_workbook('Squirrel list 50 percent genus.xlsx')
sheet = book.active
dirs = 'C://Users/user/Documents/GenBank Data'
os.chdir(dirs)
browser = webdriver.Chrome(executable_path = 'C://Users/user/chromedriver.exe',
service_args = ['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'])
start_col = 7
end_col = 9
start_row = 2
end_row = 160
url_root = 'https://www.ncbi.nlm.nih.gov/nuccore/'
url_end = '.1?report=fasta'
for y in range(start_col, end_col + 1):
    file = open(sheet.cell(row = 1, column = y).value, 'w')
    for x in range(start_row, end_row + 1):
        accession = sheet.cell(row = x, column = y).value
        if accession:
            print(accession)
            url = url_root + accession + url_end
            browser.get(url)
            data = browser.find_element_by_tag_name('pre')
            file.write(data.text + '\n' + '\n')
            browser.quit()
    file.close()
I'm using my own machine and have limited knowledge of servers and ports, which seem to be the focus of answers to similar questions. Any help would be appreciated.
I've copied the traceback below.
Traceback (most recent call last):
File "<ipython-input-1-b8f523f5e981>", line 1, in <module>
runfile('C:/Users/Alec/test.py', wdir='C:/Users/Alec')
File "C:\Users\Alec\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 880, in runfile
execfile(filename, namespace)
File "C:\Users\Alec\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/Alec/test.py", line 38, in <module>
browser.get(url)
File "C:\Users\Alec\selenium\webdriver\remote\webdriver.py", line 309, in get
self.execute(Command.GET, {'url': url})
File "C:\Users\Alec\selenium\webdriver\remote\webdriver.py", line 295, in execute
response = self.command_executor.execute(driver_command, params)
File "C:\Users\Alec\selenium\webdriver\remote\remote_connection.py", line 464, in execute
return self._request(command_info[0], url, body=data)
File "C:\Users\Alec\selenium\webdriver\remote\remote_connection.py", line 487, in _request
self._conn.request(method, parsed_url.path, body, headers)
File "C:\Users\Alec\Anaconda3\lib\http\client.py", line 1239, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\Alec\Anaconda3\lib\http\client.py", line 1285, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "C:\Users\Alec\Anaconda3\lib\http\client.py", line 1234, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "C:\Users\Alec\Anaconda3\lib\http\client.py", line 1026, in _send_output
self.send(msg)
File "C:\Users\Alec\Anaconda3\lib\http\client.py", line 964, in send
self.connect()
File "C:\Users\Alec\Anaconda3\lib\http\client.py", line 936, in connect
(self.host,self.port), self.timeout, self.source_address)
File "C:\Users\Alec\Anaconda3\lib\socket.py", line 722, in create_connection
raise err
File "C:\Users\Alec\Anaconda3\lib\socket.py", line 713, in create_connection
sock.connect(sa)
ConnectionRefusedError: [WinError 10061] No connection could be made because the target machine actively refused it
Even the website I try in the above code works well through the first iteration.
The traceback helped me spot the issue in your code:
accession = sheet.cell(row = x, column = y).value
if accession:
    print(accession)
    url = url_root + accession + url_end
    browser.get(url)
    data = browser.find_element_by_tag_name('pre')
    file.write(data.text + '\n' + '\n')
    browser.quit()
In your if statement you quit the browser, then loop again and try to get the next URL using the same browser, which is no longer there. That's why a socket connection error occurs: nothing is listening on the driver's port any more.
The solution is to move browser.quit() to the end of the script, outside the for loop.
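For illustration, the corrected structure would look roughly like this (a sketch based on the code in the question):

for y in range(start_col, end_col + 1):
    file = open(sheet.cell(row = 1, column = y).value, 'w')
    for x in range(start_row, end_row + 1):
        accession = sheet.cell(row = x, column = y).value
        if accession:
            print(accession)
            url = url_root + accession + url_end
            browser.get(url)
            data = browser.find_element_by_tag_name('pre')
            file.write(data.text + '\n' + '\n')
    file.close()

browser.quit()  # quit once, after all URLs have been fetched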
import nltk
from nltk.tag.perceptron import PerceptronTagger

tagger = PerceptronTagger()

def pos_tagging(X):
    tagset = None
    tokens = nltk.word_tokenize(X)
    tags = nltk.tag._pos_tag(tokens, tagset, tagger)
    pos_X = map(get_wordnet_pos, tags)
    return pos_X
AI.py is my Python file.
Traceback (most recent call last):
File "C:/Users/jpsamaranayake/PycharmProjects/AI/AI.py", line 206, in <module>
tagger = PerceptronTagger()
File "C:\Python27\lib\site-packages\nltk\tag\perceptron.py", line 141, in __init__
self.load(AP_MODEL_LOC)
File "C:\Python27\lib\site-packages\nltk\tag\perceptron.py", line 209, in load
self.model.weights, self.tagdict, self.classes = load(loc)
File "C:\Python27\lib\site-packages\nltk\data.py", line 801, in load
opened_resource = _open(resource_url)
File "C:\Python27\lib\site-packages\nltk\data.py", line 924, in _open
return urlopen(resource_url)
File "C:\Python27\lib\urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "C:\Python27\lib\urllib2.py", line 431, in open
response = self._open(req, data)
File "C:\Python27\lib\urllib2.py", line 454, in _open
'unknown_open', req)
File "C:\Python27\lib\urllib2.py", line 409, in _call_chain
result = func(*args)
File "C:\Python27\lib\urllib2.py", line 1265, in unknown_open
raise URLError('unknown url type: %s' % type)
urllib2.URLError: <urlopen error unknown url type: c>
Change the NLTK version to 3.1, delete the previously downloaded averaged perceptron tagger via nltk.download(), and install it again.
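A minimal sketch of the re-download step (assuming the stale model has already been removed from your nltk_data directory, e.g. through the nltk.download() GUI's Models tab):

import nltk

# Fetch a fresh copy of the averaged perceptron tagger model;
# NLTK stores it under nltk_data/taggers/.
nltk.download('averaged_perceptron_tagger')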
At work I have to access/work with the Channel Advisor API.
Source:
I'm attempting to perform a simple OrderService call:
from suds.client import Client
wsdl_url = 'https://api.channeladvisor.com/ChannelAdvisorAPI/v6/OrderService.asmx?WSDL'
service_url = 'https://api.channeladvisor.com/ChannelAdvisorAPI/v6/OrderService.asmx'
headers = {'Content-Type': 'text/xml; charset=utf-8'}
client = Client(wsdl_url, location = service_url, headers=headers)
Here is the error:
Traceback (most recent call last):
File "test_suds.py", line 9, in <module>
client = Client(wsdl_url, location = service_url, headers=headers)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/suds-0.4.1-py2.7.egg/suds/client.py", line 112, in __init__
self.wsdl = reader.open(url)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/suds-0.4.1-py2.7.egg/suds/reader.py", line 152, in open
d = self.fn(url, self.options)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/suds-0.4.1-py2.7.egg/suds/wsdl.py", line 136, in __init__
d = reader.open(url)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/suds-0.4.1-py2.7.egg/suds/reader.py", line 79, in open
d = self.download(url)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/suds-0.4.1-py2.7.egg/suds/reader.py", line 95, in download
fp = self.options.transport.open(Request(url))
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/suds-0.4.1-py2.7.egg/suds/transport/https.py", line 60, in open
return HttpTransport.open(self, request)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/suds-0.4.1-py2.7.egg/suds/transport/http.py", line 62, in open
return self.u2open(u2request)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/suds-0.4.1-py2.7.egg/suds/transport/http.py", line 118, in u2open
return url.open(u2request, timeout=tm)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 404, in open
response = self._open(req, data)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 422, in _open
'_open', req)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 382, in _call_chain
result = func(*args)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1222, in https_open
return self.do_open(httplib.HTTPSConnection, req)
File "/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/urllib2.py", line 1184, in do_open
raise URLError(err)
urllib2.URLError: <urlopen error [Errno 54] Connection reset by peer>
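One plausible cause (an assumption on my part; the traceback alone doesn't confirm it) is a TLS version mismatch: the server resets the handshake when the old urllib2 defaults are offered. Below is a sketch of a custom suds transport that forces TLS 1.2, assuming Python 2.7.9+ so that urllib2.HTTPSHandler accepts an SSL context:

import ssl
import urllib2

from suds.client import Client
from suds.transport.http import HttpTransport

class TLS12Transport(HttpTransport):
    # Append an HTTPS handler whose context forces TLS 1.2; some
    # servers reset connections negotiated with older protocol defaults.
    def u2handlers(self):
        handlers = HttpTransport.u2handlers(self)
        ctx = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
        handlers.append(urllib2.HTTPSHandler(context=ctx))
        return handlers

client = Client(wsdl_url, location=service_url, headers=headers,
                transport=TLS12Transport())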