MissingSchema: Invalid URL "/": No schema supplied

MissingSchema: Invalid URL "/": No schema supplied - python

I want to retrieve data from google links so I followed this but the above error is coming.
from urllib.request import urlopen
from urllib.request import urlretrieve
from urllib.parse import quote
qstr = quote("postal code paris")
url_getallfolders = "https://www.google.co.in/?client=safari"+qstr
x = urlopen(url_getallfolders)
data = x.read()
import requests
response=requests.get(x)
response.content

Why are you passing x? Check type(x): it is an HTTP client response object (<class 'http.client.HTTPResponse'>), not a URL. Pass the URL string to requests.get instead.
from urllib.request import urlopen
from urllib.request import urlretrieve
from urllib.parse import quote
qstr = quote("postal code paris")
url_getallfolders = "https://www.google.co.in/?client=safari"+qstr
x = urlopen(url_getallfolders)
data = x.read()
import requests
response=requests.get(url_getallfolders) # X is not an URL
response.content

It is saying that you have an invalid URL.
Try to just print the URL to make sure they are getting read correctly before passing them to requests.

I think you are mixing libraries here.
requests is one library and urllib.request is a different one. If you print data then you will see the correct HTML document.
These are two alternative approaches and should not be mixed.
alternative 1:
import urllib.request

# Alternative 1: standard library only.
# A browser-like User-Agent is sent explicitly, presumably because the
# default urllib one is not accepted by this site.
req = urllib.request.Request('https://www.google.co.in/search?q=searc',headers={'User-Agent': 'Mozilla/5.0'})
# The with-block closes the connection once the body has been read.
with urllib.request.urlopen(req) as x:
    data = x.read()
print(data)
alternative 2
import requests

# Alternative 2: requests only. The URL is passed as a plain string so this
# snippet does not depend on any urllib object.
response = requests.get('https://www.google.co.in/search?q=searc')
response.content

Related

In Web_Scraping the list is returning empty list

import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
url = "https://www.covid19india.org/"
headers = {"Accept-Language":"en-US, en;q=0.5"}
results = requests.get(url,headers = headers)
soup = BeautifulSoup(results.text,"html.parser")
cases_div = soup.find_all('div', class_="Level1")
print(cases_div)
My expected output is the HTML.
However, I am getting an empty list while printing cases_div.
Why is that and how can I fix it?

It seems that the specified website uses React, and on the first HTTP request you will not get the whole content.
Try to use selenium or try to find the API requests to the server as was suggested in the comment.

Python: (BeautifulSoup) Requesting Info with Class Gives Me '[ ]'

When requesting a website using class='even', I end up just receiving '[]' as my result.
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
url = 'https://www.worldometers.info/coronavirus/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.findAll('tr', class_='even'))
This is my result
[]
I tried looking in a lot of places but I couldn't find out why. The HTML code is really long as there is a lot of data.

I am not sure that this is the universal solution, but this is what worked for my project. I used #ahmedamerican's solution with a few changes in order to fix it.
import requests
import pandas as pd
import lxml
r = requests.get("https://www.worldometers.info/coronavirus/")
df = pd.read_html(r.content)[0]
print(df)
Just instead of doing print(type(df)) as he said, I did print(df).

BeautifulSoup doesn't extract the table

import urllib.request as urllib2 #To query website
from bs4 import BeautifulSoup #To parse website
import pandas as pd
#specify the url and open
url3 = 'http://www.thatscricket.com/ipl/2014/results/index.html'
req = urllib2.urlopen(url3)
soup = BeautifulSoup(req,"html5lib")
all_tables=soup.find_all('table')
print(all_tables)

If you see the content of your requested data
content = req.read()  # read() returns the whole body; current HTTPResponse objects have no readall()
as you examine the content:
print(content)
and, surprisingly, there is no table in it!
But if you check the page source you can see tables in it.
From what I found, urllib.request seems to have a problem with this page: some escape sequence in it causes urllib to retrieve only part of the page.
I was able to fix the problem by using requests instead of urllib.
first
pip install requests
Then change your code to this:
import requests
from bs4 import BeautifulSoup
url3 = 'http://www.thatscricket.com/ipl/2014/results/index.html'
req = requests.get(url3)
soup = BeautifulSoup(req.content,"html5lib")
all_tables=soup.find_all('table')
print(all_tables)

Python urllib is not extracting reader comments from a website

I am trying to extract reader comments from the following page with the code shown below. But the output html test.html does not contain any comments from the page. How do I get this information with Python?
http://www.theglobeandmail.com/opinion/it-doesnt-matter-who-won-the-debate-america-has-already-lost/article32314064/comments/
from bs4 import BeautifulSoup
import urllib
import urllib.request
import urllib.parse
req =urllib.request.Request('http://www.theglobeandmail.com/opinion/it-doesnt-matter-who-won-the-debate-america-has-already-lost/article32314064/comments/')
response = urllib.request.urlopen(req)
the_page = response.read()
soup = BeautifulSoup(the_page, 'html.parser')
f = open('test.html', 'w')
f.write(soup.prettify())
f.close()
Thanks!

The comments are retrieved using an AJAX request, which you can mimic:
You can see there are numerous parameters but what is below is enough to get a result, I will leave it to you to figure out how you can influence the results:
from json import loads
from urllib.request import urlopen
from urllib.parse import urlencode
data = {"categoryID":"Production",
"streamID":"32314064",
"APIKey":"2_oNjjtSC8Qc250slf83cZSd4sbCzOF4cCiqGIBF8__5dWzOJY_MLAoZvds76cHeQD",
"callback" :"foo",}
r = urlopen("http://comments.us1.gigya.com/comments.getComments", data=urlencode(data).encode("utf-8"))
json_dcts = loads(r.read().decode("utf-8"))["comments"]
print(json_dcts)
That gives you a list of dicts that hold all the comments, upvotes, negvotes, etc. If you want to parse the key, it is in the URL inside one of the scripts: src='https://cdns.gigya.com/js/socialize.js?apiKey=2_oNjjtSC8Qc250slf83cZSd4sbCzOF4cCiqGIBF8__5dWzOJY_MLAoZvds76cHeQD'; the streamID is in your original URL.

How can I read the contents of an URL with Python?

The following works when I paste it on the browser:
http://www.somesite.com/details.pl?urn=2344
But when I try reading the URL with Python nothing happens:
link = 'http://www.somesite.com/details.pl?urn=2344'
f = urllib.urlopen(link)
myfile = f.readline()
print myfile
Do I need to encode the URL, or is there something I'm not seeing?

To answer your question:
import urllib
link = "http://www.somesite.com/details.pl?urn=2344"
f = urllib.urlopen(link)
myfile = f.read()
print(myfile)
You need to read(), not readline()
EDIT (2018-06-25): Since Python 3, the legacy urllib.urlopen() was replaced by urllib.request.urlopen() (see notes from https://docs.python.org/3/library/urllib.request.html#urllib.request.urlopen for details).
If you're using Python 3, see answers by Martin Thoma or i.n.n.m within this question:
https://stackoverflow.com/a/28040508/158111 (Python 2/3 compat)
https://stackoverflow.com/a/45886824/158111 (Python 3)
Or, just get this library here: http://docs.python-requests.org/en/latest/ and seriously use it :)
import requests
link = "http://www.somesite.com/details.pl?urn=2344"
f = requests.get(link)
print(f.text)

For python3 users, to save time, use the following code,
from urllib.request import urlopen
link = "https://docs.scipy.org/doc/numpy/user/basics.broadcasting.html"
f = urlopen(link)
myfile = f.read()
print(myfile)
I know there are different threads for error: Name Error: urlopen is not defined, but thought this might save time.

None of these answers are very good for Python 3 (tested on latest version at the time of this post).
This is how you do it...
import urllib.error
import urllib.request

try:
    # The response is used as a context manager so the connection is closed
    # when the block exits.
    with urllib.request.urlopen('http://www.python.org/') as f:
        print(f.read().decode('utf-8'))
except urllib.error.URLError as e:
    # urllib.error is imported explicitly above rather than relying on
    # urllib.request importing it as a side effect.
    print(e.reason)
The above is for contents that return 'utf-8'. Remove .decode('utf-8') if you want python to "guess the appropriate encoding."
Documentation:
https://docs.python.org/3/library/urllib.request.html#module-urllib.request

A solution that works with Python 2.X and Python 3.X makes use of the Python 2 and 3 compatibility library six:
from six.moves.urllib.request import urlopen
link = "http://www.somesite.com/details.pl?urn=2344"
response = urlopen(link)
content = response.read()
print(content)

We can read website html content as below :
from urllib.request import urlopen
response = urlopen('http://google.com/')
html = response.read()
print(html)

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Works on python 3 and python 2.
# when server knows where the request is coming from.
import sys
from contextlib import closing

if sys.version_info[0] == 3:
    from urllib.request import urlopen
else:
    from urllib import urlopen

# closing() is used because the object returned by the Python 2 urlopen()
# is not a context manager; closing() works on both versions.
with closing(urlopen('https://www.facebook.com/')) as url:
    data = url.read()
# print(...) with a single argument is valid on both Python 2 and Python 3,
# unlike the Python-2-only "print data" statement.
print(data)

# When the server does not know where the request is coming from.
# Works on python 3.
import urllib.request

user_agent = \
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
url = 'https://www.facebook.com/'
# Send an explicit User-Agent header with the request.
headers = {'User-Agent': user_agent}
request = urllib.request.Request(url, None, headers)
response = urllib.request.urlopen(request)
data = response.read()
print(data)

from urllib.request import urlopen
# if has Chinese, apply decode()
html = urlopen("https://blog.csdn.net/qq_39591494/article/details/83934260").read().decode('utf-8')
print(html)

from urllib.request import urlopen
from bs4 import BeautifulSoup
link = "https://www.timeshighereducation.com/hub/sinorbis"
f = urlopen(link)
soup = BeautifulSoup(f, 'html.parser')
# get the text content of the webpage
text = soup.get_text()
print(text)
using BeautifulSoup's HTML parser we can extract the content of the webpage.

I used the following code:
import urllib.request


def read_text():
    """Download the movie quotes text file and print its contents."""
    # urllib.request.urlopen is the Python 3 replacement for urllib.urlopen.
    url = "https://s3.amazonaws.com/udacity-hosted-downloads/ud036/movie_quotes.txt"
    # The with-block closes the connection once the body has been read.
    with urllib.request.urlopen(url) as quotes:
        contents_file = quotes.read()
    # read() returns bytes on Python 3.
    # NOTE(review): assumes the file is UTF-8 text; confirm.
    print(contents_file.decode("utf-8"))


read_text()

# retrieving data from url
# only for python 3
import urllib.request


def main():
    """Fetch http://docs.python.org and print its result code and body."""
    url = "http://docs.python.org"
    # retrieving data from URL; the with-block closes the connection afterwards
    with urllib.request.urlopen(url) as webUrl:
        print("Result code: " + str(webUrl.getcode()))
        # print data from URL
        print("Returned data: -----------------")
        data = webUrl.read().decode("utf-8")
    print(data)


if __name__ == "__main__":
    main()

The URL should be a string:
import urllib
link = "http://www.somesite.com/details.pl?urn=2344"
f = urllib.urlopen(link)
myfile = f.readline()
print myfile

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

MissingSchema: Invalid URL "/": No schema supplied - python

It is saying that you have an invalid URL. Try to just print the URL to make sure they are getting read correctly before passing them to requests.

Related

In Web_Scraping the list is returning empty list

Python: (BeautifulSoup) Requesting Info with Class Gives Me '[ ]'

BeautifulSoup doesn't extract the table

Python urllib is not extracting reader comments from a website

How can I read the contents of an URL with Python?

Categories

Resources