So I have the following lines of code in a function
sock = urllib.urlopen(url)
html = sock.read()
sock.close()
and they work fine when I call the function by hand. However, when I call the function in a loop (using the same URLs as before), I get the following error:
Traceback (most recent call last):
  File "./headlines.py", line 256, in <module>
    main(argv[1:])
  File "./headlines.py", line 37, in main
    write_articles(headline, output_folder + "articles_" + term +"/")
  File "./headlines.py", line 232, in write_articles
    print get_blogs(headline, 5)
  File "/Users/michaelnussbaum08/Documents/College/Sophmore_Year/Quarter_2/Innovation/Headlines/_code/get_content.py", line 41, in get_blogs
    sock = urllib.urlopen(url)
  File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/urllib.py", line 87, in urlopen
    return opener.open(url)
  File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/urllib.py", line 203, in open
    return getattr(self, name)(url)
  File "/System/Library/Frameworks/Python.framework/Versions/2.6/lib/python2.6/urllib.py", line 314, in open_http
    if not host: raise IOError, ('http error', 'no host given')
IOError: [Errno http error] no host given
Any ideas?
Edit: more code:
def get_blogs(term, num_results):
    search_term = term.replace(" ", "+")
    print "search_term: " + search_term

    url = 'http://blogsearch.google.com/blogsearch_feeds?hl=en&q='+search_term+'&ie=utf-8&num=10&output=rss'
    print "url: " + url

    #error occurs on line below
    sock = urllib.urlopen(url)
    html = sock.read()
    sock.close()
def write_articles(headline, output_folder, num_articles=5):
    #calls get_blogs
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    output_file = output_folder+headline.strip("\n")+".txt"
    f = open(output_file, 'a')

    articles = get_articles(headline, num_articles)
    blogs = get_blogs(headline, num_articles)
#NEW FUNCTION
#the loop that calls write_articles
for term in trend_list:
    if do_find_max == True:
        fill_search_term(term, output_folder)

    headlines = headline_process(term, output_folder, max_headlines, do_find_max)

    for headline in headlines:
        try:
            write_articles(headline, output_folder + "articles_" + term +"/")
        except UnicodeEncodeError:
            pass
I had this problem when a variable I was concatenating into the URL (search_term, in your case) had a newline character at the end:

url = 'http://blogsearch.google.com/blogsearch_feeds?hl=en&q='+search_term+'&ie=utf-8&num=10&output=rss'

So make sure you do

search_term = search_term.strip()

You might also want to do

search_term = urllib2.quote(search_term)

to make sure your string is safe for a URL.
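Putting both together in get_blogs might look like this (a minimal Python 2 sketch; I've used urllib.quote_plus instead of urllib2.quote so that spaces become '+' automatically, and added a return so the caller actually gets the HTML back):

import urllib

def get_blogs(term, num_results):
    # strip stray whitespace/newlines, then percent-encode the query value
    search_term = urllib.quote_plus(term.strip())
    url = ('http://blogsearch.google.com/blogsearch_feeds?hl=en&q='
           + search_term + '&ie=utf-8&num=10&output=rss')
    sock = urllib.urlopen(url)
    html = sock.read()
    sock.close()
    return html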
Use urllib2 instead if you don't want to handle reading on a per-block basis yourself. This probably does what you expect:
import urllib2
req = urllib2.Request(url='http://stackoverflow.com/')
f = urllib2.urlopen(req)
print f.read()
In your function's loop, right before the call to urlopen, perhaps put a print statement:
print(url)
sock = urllib.urlopen(url)
This way, when you run the script and get the IOError, you will see the URL that is causing the problem. The error "no host given" can be reproduced if url is something like 'http://'...
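For example (a quick Python 2 check, separate from your code):

import urllib

try:
    urllib.urlopen('http://')   # a scheme but no host
except IOError as e:
    print e                     # [Errno http error] no host given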
Related
I'm trying to download a book from a link read from a different file, but when I execute it, the file runs well at the beginning and then it crashes:
downloading: http://31.42.184.140/main/94000/e5772a162f57b7c2b22c7ec8f6883002/%D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4%D1%80.%20-%20%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0%2C%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20%D0%B8%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4%D1%8B%20%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F%20%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu
Traceback (most recent call last):
  File "C:/Users/he/Desktop/New folder (4)/6.py", line 23, in <module>
    exec(l)
  File "<string>", line 1, in <module>
  File "C:/Users/he/Desktop/New folder (4)/6.py", line 9, in download_url
    with open(file_name, 'wb') as f:
OSError: [Errno 22] Invalid argument: '%D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4%D1%80.%20-%20%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0%2C%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20%D0%B8%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4%D1%8B%20%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F%20%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu'
Here is the code:
import requests
from bs4 import BeautifulSoup

def download_url(url):
    print("downloading: ", url)
    file_name_start_pos = url.rfind("/") + 1
    file_name = url[file_name_start_pos:]
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(file_name, 'wb') as f:
            for data in r:
                f.write(data)

def n(f):
    s = []
    for l in range(1):
        l = f.readline()
        if l:
            s.append(l)
    return s

b = open('1.txt')
c = n(b)
while len(c) > 0:
    for l in c:
        exec(l)
    c = n(b)
b.close()
File link:
download_url("http://31.42.184.140/main/94000/e5772a162f57b7c2b22c7ec8f6883002/%D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4%D1%80.%20-%20%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0%2C%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20%D0%B8%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4%D1%8B%20%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F%20%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu")
You have a link with some characters converted to percent values (to make the URL safe for the internet), but that is not a correct string for a file name, because the system may not accept it as a file name.
You can convert it back using urllib.parse.unquote():
import urllib.parse

name = urllib.parse.unquote("%D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4%D1%80.%20-%20%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0%2C%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20%D0%B8%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4%D1%8B%20%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F%20%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu")
print(name)
Result:
'Башта Т.М., и др. - Гидравлика, гидромашины и гидроприводы (Учебник для машиностроительных вузов).djvu'
There is a problem with your file name. You cannot use the long percent-encoded string from the URL as a file name.
I guess you want to use e5772a162f57b7c2b22c7ec8f6883002 as the file name; if not, you can choose another name.
I have modified your code, successfully downloaded the file from your URL, and can view it without problems.
import requests
url = 'http://31.42.184.140/main/94000/e5772a162f57b7c2b22c7ec8f6883002/%D0%91%D0%B0%D1%88%D1%82%D0%B0%20%D0%A2.%D0%9C.%2C%20%D0%B8%20%D0%B4%D1%80.%20-%20%D0%93%D0%B8%D0%B4%D1%80%D0%B0%D0%B2%D0%BB%D0%B8%D0%BA%D0%B0%2C%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D1%8B%20%D0%B8%20%D0%B3%D0%B8%D0%B4%D1%80%D0%BE%D0%BF%D1%80%D0%B8%D0%B2%D0%BE%D0%B4%D1%8B%20%28%D0%A3%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%20%D0%B4%D0%BB%D1%8F%20%D0%BC%D0%B0%D1%88%D0%B8%D0%BD%D0%BE%D1%81%D1%82%D1%80%D0%BE%D0%B8%D1%82%D0%B5%D0%BB%D1%8C%D0%BD%D1%8B%D1%85%20%D0%B2%D1%83%D0%B7%D0%BE%D0%B2%29.djvu'
def download_url(url):
    print("downloading: ", url)
    file_name = f"{url.split('/')[5]}.{url.split('.')[-1]}"
    print(url.split('/')[5])
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(file_name, 'wb') as f:
            f.write(r.content)

download_url(url=url)
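If you would rather keep the original book title as the file name, you could combine this with urllib.parse.unquote from the other answer. A sketch (assuming your filesystem accepts the decoded characters):

import urllib.parse
import requests

def download_url(url):
    # decode the last path segment back into readable text and use it as the file name
    file_name = urllib.parse.unquote(url.rsplit('/', 1)[-1])
    print("downloading:", url, "->", file_name)
    r = requests.get(url, stream=True)
    if r.status_code == requests.codes.ok:
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)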
I'm working on a web scraping project with BeautifulSoup, and at one step I need to compile a list of links from another list of links which I have saved to a file. The loop seems to run fine until it gets to the last line of the file, at which point it throws requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?. Full code and traceback below.
Does this have to do with the fact that Python is reading each row in my .txt file as a list? I also tried having only one for loop, like

for link in season_links:
    response_loop = requests.get(link[0])

but it didn't address the error.
Here is my code:
Contents of file:
https://rugby.statbunker.com/competitions/LastMatches?comp_id=98&limit=10&offs=UTC
https://rugby.statbunker.com/competitions/LastMatches?comp_id=99&limit=10&offs=UTC
# for reading season links from file
season_links = []
season_links_file = codecs.open('season_links_unpag_tst2.txt', 'r')

for line in season_links_file:
    stripped_line = line.strip()
    line_list = stripped_line.split()
    season_links.append(line_list)

season_links_file.close()
print('Season links file read complete' + '\n')
print(season_links)
# handling for pagination within each season
for link in season_links:
    t0 = time.time()
    for item in link:  # for some reason it reads each row in my .txt as a list, so I have to loop over it again
        response_loop = requests.get(item)
        html_loop = response_loop.content
        soup_loop = BeautifulSoup(html_loop, 'html.parser')
        for p in soup_loop.find_all('p', text='›'):
            season_links.append(p.find_parent('a').get('href'))
        print('Season link: ' + item)
    response_delay = time.time() - t0
    print('Loop duration: ' + str(response_delay))
    time.sleep(4*response_delay)
    print('Sleep: ' + str(response_delay*4) + '\n')
Traceback
Season link: https://rugby.statbunker.com/competitions/LastMatches?comp_id=1&limit=10&offs=UTC
Loop duration: 2.961906909942627
Sleep: 11.847627639770508
Season link: https://rugby.statbunker.com/competitions/LastMatches?comp_id=103&limit=10&offs=UTC
Loop duration: 1.6234941482543945
Sleep: 6.493976593017578
Traceback (most recent call last):
  File "/Users/claycrosby/Desktop/coding/projects/gambling/scraper/sb_compile_games.py", line 103, in <module>
    response_loop = requests.get(item)
  File "/opt/miniconda3/envs/ds383/lib/python3.8/site-packages/requests/api.py", line 76, in get
    return request('get', url, params=params, **kwargs)
  File "/opt/miniconda3/envs/ds383/lib/python3.8/site-packages/requests/api.py", line 61, in request
    return session.request(method=method, url=url, **kwargs)
  File "/opt/miniconda3/envs/ds383/lib/python3.8/site-packages/requests/sessions.py", line 516, in request
    prep = self.prepare_request(req)
  File "/opt/miniconda3/envs/ds383/lib/python3.8/site-packages/requests/sessions.py", line 449, in prepare_request
    p.prepare(
  File "/opt/miniconda3/envs/ds383/lib/python3.8/site-packages/requests/models.py", line 314, in prepare
    self.prepare_url(url, params)
  File "/opt/miniconda3/envs/ds383/lib/python3.8/site-packages/requests/models.py", line 388, in prepare_url
    raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL 'h': No schema supplied. Perhaps you meant http://h?
[Finished in 23.3s with exit code 1]
EDIT: I have tried printing each item and I find there's a third one that comes out as just h. There is no whitespace or stray h in my file though.
The issue stemmed from the fact that I was appending to the same list I was looping over: the hrefs I appended are plain strings rather than one-item lists, so when the outer loop eventually reached one of them, the inner for item in link loop iterated over its characters and requests.get was called with 'h'. Using a separate list for the results made it process without an error.
# for reading season links from file
season_links_unpag = []
season_links_file = codecs.open('season_links_unpag_tst2.txt', 'r')

for line in season_links_file:
    stripped_line = line.strip()
    line_list = stripped_line.split()
    season_links_unpag.append(line_list)

season_links_file.close()
print('Season links file read complete' + '\n')
print(season_links_unpag)

# handling for pagination within each season
season_links = []
for link in season_links_unpag:
    t0 = time.time()
    for item in link:
        print(item)
        response_loop = requests.get(item)
        html_loop = response_loop.content
        soup_loop = BeautifulSoup(html_loop, 'html.parser')
        for p in soup_loop.find_all('p', text='›'):
            season_links.append(p.find_parent('a').get('href'))
        print('Season link: ' + item)
    response_delay = time.time() - t0
    print('Loop duration: ' + str(response_delay))
    time.sleep(4*response_delay)
    print('Sleep: ' + str(response_delay*4) + '\n')
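A stripped-down illustration of the pitfall, separate from the scraping code:

links = [['https://example.com/a'], ['https://example.com/b']]

for link in links:
    if isinstance(link, list):
        # pretend a pagination href was found on the page and append it
        # to the very list we are iterating over
        links.append('https://example.com/page2')
    for item in link:
        # once the outer loop reaches an appended string, item becomes
        # its individual characters: 'h', 't', 't', 'p', ...
        print(repr(item))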
I have a text file which contains a list of URLs, and I want to print the contents of each URL into another text file, with the URL as the header. I have used this project https://pypi.org/project/Wikipedia-API/ to extract the content, but with it I would have to enter the links one after another, which I do not want to do, since my list is huge, with at least 3000 links per text file.
Can anyone help me with this? It would be highly appreciated.
EDIT:
I have tried this in the following way, but there is no content in the output txt file.
import urllib
import datetime as dt
from datetime import datetime
import time

linklist = []
with open("test.txt", 'r', encoding='utf-8') as wikitxt:
    #content = wikitxt.read().splitlines()
    for i in wikitxt:
        linklist.append(i)

output = open('Wikipedia_content.txt', 'w', encoding='utf-8')

startTime = time.time()
endTime = time.time()
runTime = endTime - startTime
print("Runtime is %3f seconds" % runTime)
Here is the txt file that I have used: https://pastebin.com/Y4bwsHGB, and this is the text file that I need to use: https://pastebin.com/SXDAu8jV.
Thanks in advance.
PROBLEM:
Traceback (most recent call last):
  File "C:/Users/suva_/Desktop/Project specification/data/test2.py", line 13, in <module>
    output_file.write((urlopen(link).read()))
  File "D:\Python 36\lib\urllib\request.py", line 228, in urlopen
    return opener.open(url, data, timeout)
  File "D:\Python 36\lib\urllib\request.py", line 531, in open
    response = self._open(req, data)
  File "D:\Python 36\lib\urllib\request.py", line 554, in _open
    'unknown_open', req)
  File "D:\Python 36\lib\urllib\request.py", line 509, in _call_chain
    result = func(*args)
  File "D:\Python 36\lib\urllib\request.py", line 1389, in unknown_open
    raise URLError('unknown url type: %s' % type)
urllib.error.URLError: <urlopen error unknown url type: https>
FINAL FIX:
import urllib.request
import datetime as dt
from datetime import datetime
import requests
import time
import re
import html2text

startTime = time.time()

def text_opener():
    linklist = []
    with open("test.txt", 'r', encoding='utf-8') as wikitxt:
        #content = wikitxt.read().splitlines()
        for i in wikitxt:
            try:
                linklist.append(i.strip())
            except UnicodeEncodeError as enror:
                linklist.append("")
    return linklist

linklist = text_opener()  # put the content in a list and then opened the text

'''
This is a string of characters which I wanted to remove from the URL content
rejectedChar = list('!"#$%&\'()*+,-./:;<=>?#[\\]^_`{|}~0123456789')
rejectedChar.append("\t")
special="\t"
regexWords = r"[\w']+"
'''

'''STOPWORDS LIST WHICH CONTAINS A BUNCH OF WORDS WHICH I DON"T NEED TO BE PRINTED--- ONLY FOR LARGE FILES
#stopwords = []
#with open('stopwords.txt', 'r', encoding='utf-8') as inFile:
#    for i in inFile:
#        stopwords.append(i.strip())
'''

content = ""
count = 0
for i in linklist:
    print(count, " ", i.encode('utf-8'))
    count += 1
    try:
        f = urllib.request.urlopen(i).read()
        content += str(f)
    except Exception as e:
        continue

#print((linklist[0:4000]).encode('utf-8'))
#combinedstops = rejectedChar + stopwords  # combining them together
#for item in combinedstops:
#    content = content.replace(item, "")  # now these items are removed from the content

def output_file(content):
    with open('June_wikipedia_content.txt', 'w', encoding='utf-8') as output:
        output.write(str(content))

## try:
##     output_file(content)
## except UnicodeEncodeError as enror:
##     print("Got lost in the game")

#sky = open("sky.txt", 'w')
#sky.write(str(content))

output_file(content)
#print("hahahahahaha", stopwords)

#for i in content:
#    i = re.findall(regexWords, i)
#    i = [i for i in i if i in stopwords]

startTime = time.time()
endTime = time.time()
runTime = endTime - startTime
print("Runtime is %3f seconds" % runTime)
You can use the following to open the text file and store all the links in a list:

with open('links.txt') as f:
    content = f.read().splitlines()

The variable content is a list with each element containing the string of a URL. This will only work, though, if your links.txt has the URLs arranged line by line, i.e.:
www.google.co.in
www.wikipedia.co.in
www.youtube.co.in
Once you get this list you can iterate through it with a simple for loop and do what you desire.
If you want a more detailed answer I suggest posting an example text file of the links.
EDIT:
This works, but it dumps the whole data into the file and the data is not formatted correctly. Is this what you need?
from urllib.request import urlopen

with open('links.txt') as f:
    content = f.read().splitlines()

with open('Wikipedia_content.txt', 'w') as output_file:
    for link in content:
        output_file.write(link)
        # decode the response bytes before writing to a text-mode file
        output_file.write(urlopen(link).read().decode('utf-8', errors='replace'))
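If the goal is each URL as a header line followed by that page's content, a slightly more explicit version (a sketch; it still writes raw HTML, since the question doesn't specify how the text should be extracted):

from urllib.request import urlopen

with open('links.txt') as f:
    links = f.read().splitlines()

with open('Wikipedia_content.txt', 'w', encoding='utf-8') as output_file:
    for link in links:
        output_file.write(link + '\n')        # the URL as a header
        html = urlopen(link).read().decode('utf-8', errors='replace')
        output_file.write(html + '\n\n')      # the page content, then a blank line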
I am trying to update the URL with keywords from another file and read the contents of the URL, but it is throwing an error.
f = open('myfile.txt')

for line in iter(f):
    id = line.strip('\n')
    url_sell = 'https://example.com/getmarketsummary?market=' + str(id)
    df = pd.read_json(url_sell, orient='columns')
Below is the error
urllib2.URLError: <urlopen error no host given>
Use try/except to debug:

for line in iter(f):
    id = line.strip('\n')
    try:
        url_sell = 'https://example.com/getmarketsummary?market=' + str(id)
    except:
        return id
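In practice, a "no host given" error from urllib2 usually means the URL string itself got mangled, for example by a blank line or trailing whitespace in myfile.txt. A sketch that strips each line, skips empty ones, and prints the URL before requesting it (same hypothetical file and endpoint as in the question) may narrow it down:

import pandas as pd

with open('myfile.txt') as f:
    for line in f:
        id = line.strip()          # drop '\n', '\r' and stray spaces
        if not id:                 # skip blank lines entirely
            continue
        url_sell = 'https://example.com/getmarketsummary?market=' + id
        print(url_sell)            # see exactly what is being requested
        try:
            df = pd.read_json(url_sell, orient='columns')
        except Exception as e:
            print('failed for %r: %s' % (id, e))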
import requests
from bs4 import BeautifulSoup
import csv
from urlparse import urljoin
import urllib2

outfile = open("./battingall.csv", "wb")
writer = csv.writer(outfile)

base_url = 'http://www.baseball-reference.com'
player_url = 'http://www.baseball-reference.com/players/'
alphabet = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']
players = 'shtml'
gamel = '&t=b&year='
game_logs = 'http://www.baseball-reference.com/players/gl.cgi?id='
years = ['2015','2014','2013','2012','2011','2010','2009','2008']

drounders = []
for dround in alphabet:
    drounders.append(player_url + dround)

urlz = []
for ab in drounders:
    data = requests.get(ab)
    soup = BeautifulSoup(data.content)
    for link in soup.find_all('a'):
        if link.has_attr('href'):
            urlz.append(base_url + link['href'])

yent = []
for ant in urlz:
    for d in drounders:
        for y in years:
            if players in ant:
                if len(ant) < 60:
                    if d in ant:
                        yent.append(game_logs + ant[44:-6] + gamel + y)

for j in yent:
    try:
        data = requests.get(j)
        soup = BeautifulSoup(data.content)
        table = soup.find('table', attrs={'id': 'batting_gamelogs'})
        tablea = j[52:59]
        tableb = soup.find("b", text='Throws:').next_sibling.strip()
        tablec = soup.find("b", text='Height:').next_sibling.strip()
        tabled = soup.find("b", text='Weight:').next_sibling.strip()
        list_of_rows = []
        for row in table.findAll('tr'):
            list_of_cells = []
            list_of_cells.append(tablea)
            list_of_cells.append(j[len(j)-4:])
            list_of_cells.append(tableb)
            list_of_cells.append(tablec)
            list_of_cells.append(tabled)
            for cell in row.findAll('td'):
                text = cell.text.replace(' ', '').encode("utf-8")
                list_of_cells.append(text)
            list_of_rows.append(list_of_cells)
        print list_of_rows
        writer.writerows(list_of_rows)
    except (AttributeError, NameError):
        pass
When I run this code to get gamelog batting data I keep getting an error:
Traceback (most recent call last):
  File "battinggamelogs.py", line 44, in <module>
    data = requests.get(j)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/api.py", line 65, in get
    return request('get', url, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/api.py", line 49, in request
    response = session.request(method=method, url=url, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/sessions.py", line 461, in request
    resp = self.send(prep, **send_kwargs)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/sessions.py", line 573, in send
    r = adapter.send(request, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/requests/adapters.py", line 415, in send
    raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', BadStatusLine("''",))
I need a way to bypass this error and keep going. I think the reason the error comes up is that there is no table to get data from.
You can wrap your requests.get() block in a try/except. You need to catch the requests.exceptions.ConnectionError that is being generated.
for ab in drounders:
    try:
        data = requests.get(ab)
        soup = BeautifulSoup(data.content)
        for link in soup.find_all('a'):
            if link.has_attr('href'):
                urlz.append(base_url + link['href'])
    except requests.exceptions.ConnectionError:
        pass
This is occurring because the connection itself has a problem, not because there is no data in the table; you aren't even getting that far.
Note: This is completely eating the exception by simply using pass (as you are also doing later in the code block). It may be better to do something like this:
except requests.exceptions.ConnectionError:
    print("Failed to open {}".format(ab))
This will provide you with a message on the console of what URL is failing.
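Note that your traceback actually points at the later loop (data = requests.get(j)), which already has a try/except but only catches (AttributeError, NameError). If the connection errors show up there, you could add the same exception to that tuple; a sketch of just that change, with the body of the loop otherwise unchanged:

for j in yent:
    try:
        data = requests.get(j)
        # ... the rest of the scraping/CSV-writing block stays the same ...
    except (AttributeError, NameError, requests.exceptions.ConnectionError):
        print("Failed to open {}".format(j))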