HTTP Error 403: Forbidden with Tabula/Requests [closed] - python

I am getting the error "urllib.error.HTTPError: HTTP Error 403: Forbidden" with Tabula. Is there a way to fix this? It has worked correctly for most of this year:
import tabula
from bs4 import BeautifulSoup
import requests

url = 'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')

hyperlink_tags = soup.find_all('a')
for hyperlink_tag in hyperlink_tags:
    if 'Situation report' in hyperlink_tag.text:
        file_path = hyperlink_tag['href']
        break

latest_report = f'https://who.int/{file_path}'
file = latest_report

tables = tabula.read_pdf(file, stream=True, pages="all", multiple_tables=True)
The problem seems to be the last line, so I'm not sure whether it's requests or tabula.

The request needs a headers parameter with a User-Agent. I'm not sure how to pass that parameter through tabula, but you can download and write the PDF to a file yourself, then read that file in:
import tabula
from bs4 import BeautifulSoup
import requests

url = 'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/situation-reports'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'lxml')

hyperlink_tags = soup.find_all('a')
for hyperlink_tag in hyperlink_tags:
    if 'Situation report' in hyperlink_tag.text:
        file_path = hyperlink_tag['href']
        break

latest_report = f'https://who.int/{file_path}'
file = latest_report

################################################
## Download the PDF ############################
from urllib.request import Request, urlopen

url_request = Request(file, headers={"User-Agent": "Mozilla/5.0"})
webpage = urlopen(url_request).read()

with open('c:/test/temp.pdf', 'wb') as f:
    f.write(webpage)
################################################

tables = tabula.read_pdf('c:/test/temp.pdf', stream=False, pages="all", multiple_tables=True)
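If you would rather avoid urllib entirely, the same download can be done with requests, which is already imported above. This is a minimal sketch that reuses the latest_report URL from the snippet above and writes to a hypothetical local file name:

import requests
import tabula

# The User-Agent header is what avoids the 403; the exact value is arbitrary.
headers = {"User-Agent": "Mozilla/5.0"}
pdf_bytes = requests.get(latest_report, headers=headers).content

local_path = "latest_report.pdf"  # hypothetical local file name
with open(local_path, "wb") as f:
    f.write(pdf_bytes)

tables = tabula.read_pdf(local_path, stream=True, pages="all", multiple_tables=True)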

Related

Downloading files from web [closed]

I need the first 5 text files from this URL using Python: http://www.textfiles.com/etext/AUTHORS/SHAKESPEARE/ . Only '.txt' files should be downloaded and stored in a folder.
I tried using the requests library to access the website.
Here I've done it using requests to get the page content and BeautifulSoup to collect the URLs of the .txt files to download from the main page:
1. Download the page content using requests
2. Using BeautifulSoup, find all <a> tags
3. Take the first 5 tags whose href ends with .txt
4. Download the content behind each of those hrefs using requests
import requests
from bs4 import BeautifulSoup

url = "http://www.textfiles.com/etext/AUTHORS/SHAKESPEARE/"
AMOUNT_OF_FILES = 5       # Number of txt files to download
FILES_EXTENSION = ".txt"  # Extension to download

# Getting url content
req = requests.get(url)
soup = BeautifulSoup(req.text, "html.parser")

# Finding all a tags in the table
a_tags = soup.find("table").find_all("a")
urls_to_download = []

# Getting urls of the .txt files to download
for a_tag in a_tags:
    if a_tag['href'].endswith(FILES_EXTENSION):
        urls_to_download.append(url + a_tag['href'])
    if len(urls_to_download) == AMOUNT_OF_FILES:
        break

# Downloading file contents
for url in urls_to_download:
    filename = url[url.rindex("/") + 1:]
    request_url = requests.get(url)
    with open(filename, "wb") as file:
        file.write(request_url.content)
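The question also asks to store the files in a folder; here is a small variation of the download loop above, assuming a hypothetical folder name of shakespeare_texts and reusing urls_to_download from the snippet above:

from pathlib import Path

import requests

out_dir = Path("shakespeare_texts")  # hypothetical output folder
out_dir.mkdir(exist_ok=True)

for url in urls_to_download:
    filename = url[url.rindex("/") + 1:]
    response = requests.get(url)
    with open(out_dir / filename, "wb") as file:
        file.write(response.content)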

In Python, I have links to 100 pages and want to save them as HTML [closed]

There are links to 100 pages inside links.txt.
This is the code I have so far (it saves only one page), but the part that saves the other 99 pages is missing:
import requests
import urllib.request, urllib.error, urllib.parse

with open('links.txt', 'r') as links:
    for link in links:
        response = urllib.request.urlopen(link)
        webContent = response.read()

        f = open('obo-t17800628-33.html', 'wb')
        f.write(webContent)
        f.close()
You need to give the files different names as you loop:
import requests
import urllib.request, urllib.error, urllib.parse

with open('links.txt', 'r') as links:
    for idx, link in enumerate(links):
        response = urllib.request.urlopen(link)
        webContent = response.read()
        with open('obo-t17800628-33.html' + str(idx), 'wb') as fout:
            fout.write(webContent)
This will append a number to the end of each file name.
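If you would rather keep the .html extension so the files still open in a browser, here is a variation using a hypothetical page_<n>.html naming scheme (and stripping the trailing newline from each line of links.txt before opening it):

import urllib.request

with open('links.txt', 'r') as links:
    for idx, link in enumerate(links):
        web_content = urllib.request.urlopen(link.strip()).read()
        # hypothetical naming scheme: page_0.html, page_1.html, ...
        with open(f'page_{idx}.html', 'wb') as fout:
            fout.write(web_content)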

How to extract URLs from a web page [closed]

I need a way to extract the URLs from the list at this web page, https://iota-nodes.net/, using Python. I tried BeautifulSoup but without success.
My code is:
from bs4 import BeautifulSoup, SoupStrainer
import requests

url = "https://iota-nodes.net/"
page = requests.get(url)
data = page.text
soup = BeautifulSoup(data)

for link in soup.find_all('a'):
    print(link.get('href'))
No need for BeautifulSoup, as the data is coming from an AJAX request. Something like this should work:
import requests
response = requests.get('https://api.iota-nodes.net/')
data = response.json()
hostnames = [node['hostname'] for node in data]
Note that the data comes from an API endpoint being https://api.iota-nodes.net/.
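Here is a slightly more defensive version of the same call, assuming the endpoint still returns a JSON list of node objects with a hostname field:

import requests

response = requests.get('https://api.iota-nodes.net/', timeout=10)
response.raise_for_status()  # fail loudly on a non-2xx status instead of parsing an error page

hostnames = [node['hostname'] for node in response.json()]
print(hostnames[:10])  # show the first few hostnames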

searching through html with python [closed]

My question is about searching through HTML with Python.
I am using this code:
with urllib.request.urlopen("http://") as url:
    data = url.read().decode()
Now this returns the whole HTML code of the page, and I want to extract all email addresses.
Can somebody lend me a hand here?
Thanks in advance
Using BeautifulSoup and requests you could do this:
import requests
from bs4 import BeautifulSoup
import re

response = requests.get("your_url")
response_text = response.text
beautiful_response = BeautifulSoup(response_text, 'html.parser')

email_regex = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+'

list_of_emails = re.findall(email_regex, beautiful_response.text)
list_of_emails_decoded = []
for every_email in list_of_emails:
    list_of_emails_decoded.append(every_email.encode('utf-8'))
Remember that you should not use regex for actual HTML parsing (thanks @Patrick Artner), but you can use BeautifulSoup to extract all visible text or comments on a web page. Then you can use this text (which is just a string) to look for email addresses. Here is how you can do it:
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
import re

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    return True

def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)
    return u" ".join(t.strip() for t in visible_texts)

with urllib.request.urlopen("https://en.wikipedia.org/wiki/Email_address") as url:
    data = url.read().decode()

text = text_from_html(data)
print(re.findall(r"[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*", text))
The two helper functions just grab all text that can be seen on the page, and then the ridiculously long regex pulls all email addresses from that text. I used Wikipedia's article on email addresses as an example, and here is the output:
['John.Smith@example.com', 'local-part@domain', 'jsmith@example.com', 'john.smith@example.org', 'local-part@domain', 'John..Doe@example.com', 'fred+bah@domain', 'fred+foo@domain', 'fred@domain', 'john.smith@example.com', 'john.smith@example.com', 'jsmith@example.com', 'JSmith@example.com', 'john.smith@example.com', 'john.smith@example.com', 'prettyandsimple@example.com', 'very.common@example.com', 'disposable.style.email.with+symbol@example.com', 'other.email-with-dash@example.com', 'fully-qualified-domain@example.com', 'user.name+tag+sorting@example.com', 'user.name@example.com', 'x@example.com', 'example-indeed@strange-example.com', 'admin@mailserver1', "#!$%&'*+-/=?^_`{}|~@example.org", 'example@s.solutions', 'user@localserver', 'A@b', 'c@example.com', 'l@example.com', 'right@example.com', 'allowed@example.com', 'allowed@example.com', '1234567890123456789012345678901234567890123456789012345678901234+x@example.com', 'john..doe@example.com', 'example@localhost', 'john.doe@example', 'joeuser+tag@example.com', 'joeuser@example.com', 'foo+bar@example.com', 'foobar@example.com']
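The list contains several duplicates; if you only need each address once, here is a small follow-up sketch that reuses the text variable from the code above together with the simpler regex from the first answer:

import re

# Deduplicate the matches and print them in a stable order
matches = re.findall(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', text)
unique_emails = sorted(set(matches))
print(len(unique_emails), "unique addresses")
print(unique_emails)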

Find hyperlinks of a page in python without BeautifulSoup [closed]

What I am trying to do is find all the hyperlinks of a web page. Here is what I have so far, but it does not work:
from urllib.request import urlopen

def findHyperLinks(webpage):
    link = "Not found"
    encoding = "utf-8"
    for webpagesline in webpage:
        webpagesline = str(webpagesline, encoding)
        if "<a href>" in webpagesline:
            indexstart = webpagesline.find("<a href>")
            indexend = webpagesline.find("</a>")
            link = webpagesline[indexstart+7:indexend]
            return link
    return link

def main():
    address = input("Please enter the address of the webpage to find the hyperlinks: ")
    try:
        webpage = urlopen(address)
        link = findHyperLinks(webpage)
        print("The hyperlinks are", link)
        webpage.close()
    except Exception as exceptObj:
        print("Error:", str(exceptObj))

main()
There are multiple problems in your code. One of them is that you are searching for the literal tag <a href>, i.e. an anchor whose only attribute is a present but empty href, which real pages will almost never contain.
Anyway, if you used an HTML parser (to, well, parse HTML), things would get much easier and more reliable. Example using BeautifulSoup:
from bs4 import BeautifulSoup
from urllib.request import urlopen

soup = BeautifulSoup(urlopen(address))
for link in soup.find_all("a", href=True):
    print(link["href"], link.get_text())
Without BeautifulSoup you can use a regular expression and a simple function:
from urllib.request import urlopen
import re

def find_link(url):
    response = urlopen(url)
    res = str(response.read())
    my_dict = re.findall('(?<=<a href=")[^"]*', res)
    for x in my_dict:
        # skip page bookmarks, like #about
        if x[0] == '#':
            continue
        # turn root-relative urls like /about.html into absolute urls;
        # be careful with redirects and add more flexible
        # processing, if needed
        if x[0] == '/':
            x = url + x
        print(x)

find_link('http://cnn.com')
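Since the question rules out BeautifulSoup, the standard library's html.parser is a more robust alternative to the regex. A minimal sketch (the LinkCollector class and the cnn.com URL are just for illustration):

from html.parser import HTMLParser
from urllib.request import urlopen

class LinkCollector(HTMLParser):
    """Collect the href attribute of every <a> tag encountered."""

    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the start tag
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)

parser = LinkCollector()
html = urlopen("http://cnn.com").read().decode("utf-8", errors="replace")
parser.feed(html)
print(parser.links)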
