Unknown URL Type: Image Scraping - python

My error:
File "C:/Users/hp dv4/PycharmProjects/project/imagescrap.py", line
22, in
imagefile.write(urllib.request.urlopen(img_src).read())
ValueError: unknown url type: '/img/logo_with_text.png'
I am getting this error while crawling through the specified website whereas, this same code works fine with some other website.
import urllib.request
from bs4 import BeautifulSoup
def make_soup(url):
thepage = urllib.request.urlopen(url)
soupdata = BeautifulSoup(thepage, "html.parser")
return soupdata
i = 1
soup = make_soup("http://ioe.edu.np/")
unique_srcs = []
for img in soup.findAll('img'):
if img.get('src') not in unique_srcs:
unique_srcs.append(img.get('src'))
for img_src in unique_srcs:
filename = str(i)
i = i + 1
imagefile = open(filename + '.png', 'wb')
imagefile.write(urllib.request.urlopen(img_src).read())
imagefile.close()

the above code will encounter one more error.
you are trying to save every file with .png extension, which may make the files unreadable.
import urllib.request
from bs4 import BeautifulSoup
def make_soup(url):
thepage = urllib.request.urlopen(url)
soupdata = BeautifulSoup(thepage, "html.parser")
return soupdata
base_url = "http://ioe.edu.np/"
soup = make_soup(base_url)
unique_srcs = []
for img in soup.findAll('img'):
if img.get('src') not in unique_srcs:
unique_srcs.append(img.get('src'))
for i, img_src in enumerate(unique_srcs):
print(img_src)
filename = str(i)
extension = img_src.split('.')[-1]
with open(filename+'.'+extension, 'wb') as f:
f.write(urllib.request.urlopen(base_url+img_src).read())
few idiomatic python suggestions:
use enumerate instead of trying to manage a counter.
use the with-open construct which takes care of closing your file.
one other thing you could do to further improve:
use a set instead of a list, so that you don't download the same file twice.

As the error messages says:
unknown url type: '/img/logo_with_text.png'
add http://ioe.edu.np/ in front of img_src and it should work

Related

Downloading multiple files from a URL Directory with a similar name

I am trying to download multiple .csv files from an url directory with similar names (AB_daily.csv, BC_daily.csv, etc.). However, each file is stored in different folders in the directory. I know there is a way to use a loop to extract the files, but I can't figure out how to do it with Beautiful Soup or glob. Do you have any suggestions? I've also used pandas.read_csv() to look for shortcuts as I'm just trying to concatenate the files together later. Thank you.
URL Directory: https://dd.weather.gc.ca/hydrometric/csv/
import os
import requests
import urllib.request
from bs4 import BeautifulSoup
def main(url):
with requests.Session() as req:
r = req.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
target = [f"{url[:20]}{item['href']}" for item in soup.select(
"a[href$='AB_daily_hydrometric.csv']")
for x in target:
print(f"Downloading {x}")
r = req.get(x)
name = x.rsplit("/", 1)[-1]
with open(name, 'wb') as f:
f.write(r.content)
main('https://dd.weather.gc.ca/hydrometric/csv')
# For this specific task this will work:
import os
import requests
import urllib.request
from bs4 import BeautifulSoup
csv_links = []
links = ["AB/", "BC/", "MB/", "NB/", "NL/", "NS/", "NT/", "NU/" "ON/", " PE/", "QC/", "SK/", "YT/"]
def main(url):
with requests.Session() as req:
r = req.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
for item in links:
r_daily = BeautifulSoup(req.get(url + item + "daily/").content, 'html.parser')
r_hourly = BeautifulSoup(req.get(url + item + "hourly/").content, 'html.parser')
for item_d in r_daily.find_all('a'):
if ".csv" in item_d.get('href'):
csv_links.append(url + item + "daily/" +item_d.get('href'))
for item_h in r_hourly.find_all('a'):
if ".csv" in item_h.get('href'):
csv_links.append(url + item + "hourly/" + item_h.get('href'))
for x in csv_links:
print(f"Downloading {x}")
r = req.get(x)
name = x.rsplit("/", 1)[-1]
with open(name, 'wb') as f:
f.write(r.content)
main('https://dd.weather.gc.ca/hydrometric/csv/')

File names are cut while download to folder

My current code is cutting first 6 characters from file names while downloading PDF's. So for example PDF file name is 123456acII.pdf (https://example.com/wp-content/uploads/2016/11/123456acII.pdf) but file in folder is acII.pdf.
How to make names be as they are?
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
main = "https://example.com/"
#If there is no such folder, the script will create one automatically
folder_location = r'C:\temp\webscraping'
if not os.path.exists(folder_location):os.mkdir(folder_location)
def Get_Links():
r = requests.get(main).text
soup = BeautifulSoup(r, 'html.parser')
links = []
for item in soup.findAll("div", {'class': 'large-4 medium-4 columns'}):
for n in item.find_all('a'):
print ('Link: '+ n.get('href'))
links.append(n.get('href'))
return links
def Parse_Links():
pdf = set()
for url in Get_Links():
r = requests.get(url).text
soup = BeautifulSoup(r, 'html.parser')
for item in soup.findAll("div", {'class': 'large-6 medium-8 columns large-centered'}):
for link in item.findAll("a"):
link = link.get("href")
if link:
pdf.add(link)
return pdf
def Save():
for item in Parse_Links():
print(f"Downloading File: {item[55:]}")
filename = os.path.join(folder_location,f"{item[55:]}")
r = requests.get(item)
with open(filename, 'wb') as f:
f.write(r.content)
print("done")
Save()
It looks like you are slicing the string starting at index position 55 {item[55:]}. Try to see if it's simply just starting your index position 6 positions prior:
change to: {item[49:]}

Python WebScraper - object has no attribute 'urlretrieve'

I am trying to create a python webscraper that downloads a certain amount of images from a url, to my current directory. However for the following code:
urllib.request.urlretrieve(each, filename)
It is saying that: AttributeError: 'function' object has no attribute 'urlretrieve' when running the program
Here is the full code:
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
url = 'https://unsplash.com/s/photos/download'
def download_imgs(url, amountOfImgs):
html = urlopen(url).read()
#parsing the html from the url
page_soup = soup(html, "html.parser")
images = [img for img in page_soup.findAll('img')]
counter = 0
#compiling the unicode list of image links
image_links = [each.get('src') for each in images]
for each in image_links:
if(counter <= amountOfImgs):
filename = each.split('/')[-1]
urllib.request.urlretrieve(each, filename)
counter += 1
else:
return image_links
print(download_imgs(url, 5))
It looks like when you imported just URLOpen, you missed everything else.
I did it a bit differently, I got the html using the requests.get method, and removed the need for url open, you could just do
import urlopen, urlretrieve
if you want to use mine, I know it worked,
import urllib.request
from bs4 import BeautifulSoup as soup
import requests
url = 'https://unsplash.com/s/photos/download'
def download_imgs(url, amountOfImgs):
req=requests.get(url)
html=req.text
#parsing the html from the url
page_soup = soup(html, "html.parser")
images = [img for img in page_soup.findAll('img')]
counter = 0
#compiling the unicode list of image links
image_links = [each.get('src') for each in images]
for each in image_links:
if(counter <= amountOfImgs):
filename = each.split('/')[-1]
urllib.request.urlretrieve(each, filename)
counter += 1
else:
return image_links
print(download_imgs(url, 5))

Image saved twice using Beautiful Soup

With the following code, every image is saved two times. How can I skip the image that is already saved?
import urllib.request
from bs4 import BeautifulSoup
def make_soup(url):
thepage = urllib.request.urlopen(url)
soupdata = BeautifulSoup(thepage, "html.parser")
return soupdata
i = 1
soup = make_soup("https://www./")
for img in soup.findAll('img'):
temp = img.get('src')
image = temp
if str(image):
filename = str(i)
i = i + 1
imagefile = open(filename + '.png', 'wb')
imagefile.write(urllib.request.urlopen(image).read())
imagefile.close()
You can use a set structure to filter out the duplicates like so:
unique_srcs = list(set([img.get('src') for img in soup.findAll('img')]))
for img_src in unique_srcs:
filename = str(i)
i = i + 1
imagefile = open(filename + '.png', 'wb')
imagefile.write(urllib.request.urlopen(img_src).read())
imagefile.close()
Now, bear in mind that this might change the order of the files. If you can't afford to change the order, you can achieve the same by looping through the soup.findAll() list and checking whether each element's src is not in an another unique_list, then appending that element to that unique_list of srcs and then looping over it like I did.
EDIT:
To keep the order, use this code instead of the list comprehension for unique_srcs array.
unique_srcs = []
for img in soup.findAll('img'):
if img.get('src') not in unique_srcs:
unique_srcs.append(img.get('src'))
Your first element would then be unique_srcs[0] (logo in your case).

How can I export an web-scraping table into csv with multiple rows?

I wrote this code on Python 2.7.13, for scraping datatable from a website.
import urllib2
from bs4 import BeautifulSoup
import csv
import os
out=open("proba.csv","rb")
data=csv.reader(out)
def make_soup(url):
thepage = urllib2.urlopen(url)
soupdata = BeautifulSoup(thepage, "html.parser")
return soupdata
maindatatable=""
soup = make_soup("https://www.mnb.hu/arfolyamok")
for record in soup.findAll('tr'):
datatable=""
for data in record.findAll('td'):
datatable=datatable+","+data.text
maindatatable = maindatatable + "\n" + datatable[1:]
header = "Penznem,Devizanev,Egyseg,Penznemforintban"
print maindatatable
file = open(os.path.expanduser("proba.csv"),"wb")
utf16_str1 =header.encode('utf16')
utf16_str2 = maindatatable.encode('utf16')
file.write(utf16_str1)
file.write(utf16_str2)
file.close()
I want to export this into CSV with the next 4 rows:
"Penznem Devaizanev Egyseg Penznemforintban"
The data are separated with "," but the last two values is ONE row. (283,45)
How can I fix it?
you can not avoid last coma directly but,
What you can simply do is to use another seprator i.e. ;(semicolon)
and when you open file in exel,calc select (;)semicolon as seprator and you will get result as expected!
import urllib2
from bs4 import BeautifulSoup
import csv
import os
out=open("proba.csv","rb")
data=csv.reader(out)
def make_soup(url):
thepage = urllib2.urlopen(url)
soupdata = BeautifulSoup(thepage, "html.parser")
return soupdata
maindatatable=""
soup = make_soup("https://www.mnb.hu/arfolyamok")
for record in soup.findAll('tr'):
datatable=""
for data in record.findAll('td'):
datatable=datatable+";"+data.text
maindatatable = maindatatable + "\n" + datatable[1:]
header = "Penznem;Devizanev;Egyseg;Penznemforintban"
print maindatatable
file = open(os.path.expanduser("proba.csv"),"wb")
utf16_str1 =header.encode('utf16')
utf16_str2 = maindatatable.encode('utf16')
file.write(utf16_str1)
file.write(utf16_str2)
file.close()

Categories

Resources