I am iterating through multiple pages with the same url except for the number at the end. Once it reaches a 404, however, it freezes the program, even though I am catching the exception in a try block. Am I missing something here? Here is my code. The program hangs once it hits https://www.tenable.com/plugins/nessus/14587
import bs4 as bs
from urllib.request import urlopen, Request
import urllib
ID = 14580
while ID < 132734:
#ID == 14391
ID == 14580
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
reg_url = "https://www.tenable.com/plugins/nessus/" + str(ID)
req = Request(url=reg_url, headers=headers)
try:
source = urlopen(req).read()
except urllib.error.HTTPError as e:
if e.getcode() == 404: # check the return code
continue
raise
soup = bs.BeautifulSoup(source,'lxml')
print(ID)
print(reg_url)
print(soup.title.string)
ID += 1
UPDATED WORKING CODE:
import bs4 as bs
from urllib.request import urlopen, Request
import urllib
ID = 14580
while ID < 132734:
#ID == 14391
ID == 14580
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}
reg_url = "https://www.tenable.com/plugins/nessus/" + str(ID)
req = Request(url=reg_url, headers=headers)
try:
source = urlopen(req).read()
except urllib.error.HTTPError as e:
if e.getcode() == 404: # check the return code
ID +=1
continue
raise
soup = bs.BeautifulSoup(source,'lxml')
print(ID)
print(reg_url)
print(soup.title.string)
ID += 1
I added another increment to ID inside the exception block as seen in the updated code and it works fine now
Related
How do I fix TypeError, Parsing, and all other errors in my python Yahoo Finance Webscraper. I cannot get my code to pull from Yahoo finance. Any fixes? It looks like span classes are the problem since they were removed and replaced by fin-streamer.
Error:
error
Code:
import requests
from bs4 import BeautifulSoup
def create_url():
symbol = str(input('Enter Stock Symbol: '))
url = f'https://finance.yahoo.com/quote/{symbol}'
return url
def get_html(url):
header = {"User Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'}
response = requests.get(url, headers = header)
if response.status_code == 200:
return response.text
else:
return None
def parse_data(html):
soup = BeautifulSoup(html,'html.parser')
name = soup.find('h1', {'class': 'D(ib) Fz(18px)'}).text
price = soup.find('fin-streamer', {'class': 'D(ib) Mend(20px)'}).find_all('fin-streamer')[0].text
change = soup.find('fin-streamer', {'class': 'D(ib) Mend(20px)'}).find_all('fin-streamer')[1].text
previous_close = soup.find('fin-streamer', {'class': 'Trsdu(0.3s)'}).text
open_price = soup.find('td',{'class':'Ta(end) Fw(600) Lh(14px)'}).text
print(f'|Stock Name: {name}|', f'|Stock Price: ${price}|', f'|Change: {change}|', f'|Previous Close: ${previous_close}|', f'|Open Price: ${open_price}|')
# print(f'Stock Price: ${price}')
# print(f'Change: {change}')
# print(f'Previous Close: ${previous_close}')
# print(f'Open Price: ${open_price}')
stock_data = {
'name':name,
'price':price,
'change':change ,
'previous_close': previous_close,
'open_price': open_price
}
return stock_data
def main():
# get users input
url = create_url()
# get html
html = get_html(url)
# while loop
i = True
while i:
# parse data
data = parse_data(html)
if __name__ == '__main__':
main()
def create_url():
symbol = str(input('Enter Stock Symbol: '))
url = f'https://finance.yahoo.com/quote/%7Bsymbol%7D'
return url
This function contains an error: in order to interpolate your symbol into the url, you need to do something like this:
url = f'https://finance.yahoo.com/quote/{symbol}'
As a result, you where GET'ing the wrong URL, but your function get_html() returns None if you fail to get a status 200. That None gets passed to BeautifulSoup as HTML, generating your error.
It's good that your get_html() function checks the status, but it should be failing if status indicates failure.
Update: that was a copy and paste error.
Your error is caused by passing None to BeautifulSoup - you can confirm this by running:
>>> from bs4 import BeautifulSoup
>>> soup = BeautifulSoup(None, 'html.parser')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/private/tmp/venv/lib/python3.9/site-packages/bs4/__init__.py", line 312, in __init__
elif len(markup) <= 256 and (
TypeError: object of type 'NoneType' has no len()
You are passing None because that's what your get_html() func might be returning:
if response.status_code == 200:
return response.text
else:
return None
If your function fails to GET a 200 status code, you need to fail, not return None.
Try replacing the entire get_html() func with this:
def get_html(url):
header = {"User Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36'}
response = requests.get(url, headers = header)
response.raise_for_status()
return response.text
This function will raise an exception if the http request failed - otherwise, it will return the html, which you can then feed to BeautifulSoup
with attached screenshot my question can be explained quite well.
I am scraping the following page: https://www.transfermarkt.de/tsg-1899-hoffenheim/kader/verein/533/saison_id/2019/plus/1
Table 1 lists the team. In the second column is the player. I need the link as you can see in the screenshot on the bottom left.
When I look into the data frame normally, I only get the following in this cell: "Oliver BaumannO. BaumannTorwart" But I am looking for "https://www.transfermarkt.de/oliver-baumann/profil/spieler/55089".
You guys got any ideas?
Code:
import pandas as pd
import requests
# Global variables
HEADS = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
dateiname = 'test.xlsx'
# Global variables
def get_response(url):
# URL-Anfrage durchfuehren
try:
response = requests.get(url, headers=HEADS)
except AttributeError:
print('AttributeError')
return response
def scraping_kader(response):
try:
dfs = pd.read_html(response.text)
#dfs = dfs.to_html(escape=False)
print(dfs[1])
print(dfs[1].iloc[0, :])
except ImportError:
print(' ImportError')
except ValueError:
print(' ValueError')
except AttributeError:
print(' AttributeError')
response = get_response('https://www.transfermarkt.de/tsg-1899-hoffenheim/kader/verein/533/saison_id/2019/plus/1')
scraping_kader(response)
as I know read_html gets only text from table and it doesn't care of links, hidden elements, attributes, etc.
You need module like BeautifulSoup or lxml to work with full HTML and manually get needed information.
soup = BeautifulSoup(response.text, 'html.parser')
all_tooltips = soup.find_all('td', class_='hauptlink')
for item in all_tooltips:
item = item.find('a', class_='spielprofil_tooltip')
if item:
print(item['href']) #, item.text)
This example gets only links but in the same way you can get other elements.
import requests
from bs4 import BeautifulSoup
#import pandas as pd
HEADS = {
'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
}
def get_response(url):
try:
response = requests.get(url, headers=HEADS)
except AttributeError:
print('AttributeError')
return response
def scraping_kader(response):
try:
soup = BeautifulSoup(response.text, 'html.parser')
all_tooltips = soup.find_all('td', class_='hauptlink')
for item in all_tooltips:
item = item.find('a', class_='spielprofil_tooltip')
if item:
print(item['href']) #, item.text)
#print(dfs[1])
#print(dfs[1].iloc[0, :])
except ImportError:
print(' ImportError')
except ValueError:
print(' ValueError')
except AttributeError:
print(' AttributeError')
# --- main --
response = get_response('https://www.transfermarkt.de/tsg-1899-hoffenheim/kader/verein/533/saison_id/2019/plus/1')
scraping_kader(response)
Result
/oliver-baumann/profil/spieler/55089
/philipp-pentke/profil/spieler/8246
/luca-philipp/profil/spieler/432671
/stefan-posch/profil/spieler/223974
/kevin-vogt/profil/spieler/84435
/benjamin-hubner/profil/spieler/52348
/kevin-akpoguma/profil/spieler/160241
/kasim-adams/profil/spieler/263801
/ermin-bicakcic/profil/spieler/51676
/havard-nordtveit/profil/spieler/42234
/melayro-bogarde/profil/spieler/476915
/konstantinos-stafylidis/profil/spieler/148967
/pavel-kaderabek/profil/spieler/143798
/joshua-brenet/profil/spieler/207006
/florian-grillitsch/profil/spieler/195736
/diadie-samassekou/profil/spieler/315604
/dennis-geiger/profil/spieler/251309
/ilay-elmkies/profil/spieler/443752
/christoph-baumgartner/profil/spieler/324278
/mijat-gacinovic/profil/spieler/215864
/jacob-bruun-larsen/profil/spieler/293281
/sargis-adamyan/profil/spieler/125614
/felipe-pires/profil/spieler/327911
/robert-skov/profil/spieler/270393
/ihlas-bebou/profil/spieler/237164
/andrej-kramaric/profil/spieler/46580
/ishak-belfodil/profil/spieler/111039
/munas-dabbur/profil/spieler/145866
/klauss/profil/spieler/498862
/maximilian-beier/profil/spieler/578392
That helps me.
I have now copied the table with pandas and replaced the column with the name with the link from your BS4 code. Works!
I've created a script in python to get the first 400 links of search results from bing. It's not sure that there will always be at least 400 results. In this case the number of results is around 300. There are 10 results in it's landing page. However, the rest of the results can be found traversing next pages. The problem is when there is no more next page link in there, the webpage displays the last results over and over again.
Search keyword is michael jackson and ths is a full-fledged link
How can I get rid of the loop when there are no more new results or the results are less than 400?`
I've tried with:
import time
import requests
from bs4 import BeautifulSoup
link = "https://www.bing.com/search?"
params = {'q': 'michael jackson','first': ''}
def get_bing_results(url):
q = 1
while q<=400:
params['first'] = q
res = requests.get(url,params=params,headers={
"User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
})
soup = BeautifulSoup(res.text,"lxml")
for link in soup.select("#b_results h2 > a"):
print(link.get("href"))
time.sleep(2)
q+=10
if __name__ == '__main__':
get_bing_results(link)
As I mentioned in the comments, couldn't you do something like this:
import time
import requests
from bs4 import BeautifulSoup
link = "https://www.bing.com/search?"
params = {'q': 'michael jackson','first': ''}
def get_bing_results(url):
q = 1
prev_soup = str()
while q <= 400:
params['first'] = q
res = requests.get(url,params=params,headers={
"User-Agent":"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36"
})
soup = BeautifulSoup(res.text,"lxml")
if str(soup) != prev_soup:
for link in soup.select("#b_results h2 > a"):
print(link.get("href"))
prev_soup = str(soup)
else:
break
time.sleep(2)
q+=10
if __name__ == '__main__':
get_bing_results(link)
I'm getting 'HTTP Error 405: Method Not Allowed' error. My code is
import urllib.request
import urllib.parse
try:
url = 'https://www.google.com/search'
values = {'q': 'python programming tutorials'}
data = urllib.parse.urlencode(values)
data = data.encode('utf-8') # data should be bytes
headers = {}
headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
req = urllib.request.Request(url, data, headers = headers)
resp = urllib.request.urlopen(req)
print("HERE")
respData = resp.read()
saveFile = open('withHeaders.txt', 'w')
saveFile.write(str(respData))
saveFile.close()
except Exception as e:
print(e)
The error I guess is in req = urllib.request.Request(url, data, headers = headers). What is the error, syntactical? What should be changed in code? And any conceptual mistake do correct me.
EDIT
Concept:
def URLRequest(url, params, method="GET"):
if method == "POST":
return urllib2.Request(url, data=urllib.urlencode(params))
else:
return urllib2.Request(url + "?" + urllib.urlencode(params))
You can use Requests library instead. It's much cleaner than urllib
import requests
q = 'Whatever you want to search'
url = 'https://www.google.com/search'
response = requests.get(url+'?'+'q='+q)
saveFile = open('response.txt', 'w')
savefile.write(response.text)
savefile.close()
Or if you want to stick to the urllib , you can do this:
import urllib.request
url = 'https://www.google.com/search'
q = 'Search Query'
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
request = urllib.request.Request(url+'?'+'q='+q, headers=headers)
response = urllib.request.urlopen(request).read() # the text of the response is here
saveFile = open('withHeaders.txt', 'w')
saveFile.write(str(response))
saveFile.close()
Here in reference to www.pythonforbeginners
# Importing the module
import urllib.request
# your search text
text="hi google"
# Define the url
url = 'http://www.google.com/#q='+text
# Add your headers
headers = {'User-Agent' : 'Mozilla 5.10'}
# Create the Request.
request = urllib.request.Request(url, None, headers)
# Getting the response
response = urllib.request.urlopen(request)
# Print the headers
print (response.read())
I have the code ready for one keyword and its working fine. Next problem is I want to do the scrape for 10 different keywords and save them in one csv file with the keyword name on column/row. I think we can give csv file as input and it picks keyword one by one and does scrape. Here is the code:
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = "http://www.amazon.in/s/ref=sr_pg_2?
rh=n%3A4772060031%2Ck%3Ahelmets+for+men&keywords=helmets+for+men&ie=UTF8"
#excluding page from base_url for further adding
res = []
for page in range(1,3):
request = requests.get(base_url + '&page=' + str(page), headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}) # here adding page
if request.status_code == 404: #added just in case of error
break
soup = BeautifulSoup(request.content, "lxml")
for url in soup.find_all('li', class_ = 's-result-item'):
res.append([url.get('data-asin'), url.get('id')])
df = pd.DataFrame(data=res, columns=['Asin', 'Result'])
df.to_csv('hel.csv')
I made some sample keywords, replace on needed ones.
import requests
from bs4 import BeautifulSoup
import pandas as pd
base_url = "http://www.amazon.in/s/ref=sr_pg_2?rh=n%3A4772060031%2Ck%3Ahelmets+for+men&ie=UTF8"
keywords_list = ['helmets for men', 'helmets for women']
keyword = 'helmets for men'
#excluding page from base_url for further adding
res = []
for page in range(1,3):
for keyword in keywords_list:
request = requests.get(base_url + '&keywords=' + requests.utils.quote(keyword) + '&page=' + str(page), headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'}) # here adding page
if request.status_code == 404: #added just in case of error
break
soup = BeautifulSoup(request.content, "lxml")
for url in soup.find_all('li', class_ = 's-result-item'):
res.append([url.get('data-asin'), url.get('id'), keyword])
df = pd.DataFrame(data=res, columns=['Asin', 'Result', 'keyword'])
df.to_csv('hel.csv')