I am trying to use the following code to extract all images from a webpage, but it gives the error 'NoneType' object has no attribute 'group'. Can someone tell me what the problem is here?
import re
import requests
from bs4 import BeautifulSoup
site = 'http://pixabay.com'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]
for url in urls:
    filename = re.search(r'/([\w_-]+[.](jpg|gif|png))$', url)
    with open(filename.group(1), 'wb') as f:
        if 'http' not in url:
            # sometimes an image source can be relative
            # if it is provide the base url which also happens
            # to be the site variable atm.
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)
EDIT: For context, since the original question has been edited by someone else and the original code has changed: the pattern the user was originally using was r'/([\w_-]+.)$'. That was the original problem, and this context will make the following answer easier to follow:
I went with a pattern like r'/([\w_.-]+)$'. The pattern you were using didn't allow the file name to contain a . except as the very last character, because . outside of [] means any character and you had it right before $ (the end of the string). So I moved the . into [], where it means a literal . in the character class. That allowed the pattern to capture image file names at the end of the URL.
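As a quick check with a made-up URL (just for illustration), the old pattern finds no match for a name with a dot in the middle, while the adjusted one captures it:
import re

url = 'http://pixabay.com/static/img/logo.png'   # hypothetical example URL
old = re.search(r'/([\w_-]+.)$', url)            # the pattern from the original question
new = re.search(r'/([\w_.-]+)$', url)            # the adjusted pattern
print(old)           # None, so calling .group(1) on it raises AttributeError
print(new.group(1))  # logo.png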
import re
import requests
from bs4 import BeautifulSoup
site = 'http://pixabay.com'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img')
urls = [img['src'] for img in img_tags]
for url in urls:
    filename = re.search(r'/([\w_.-]+)$', url)
    with open(filename.group(1), 'wb') as f:
        if 'http' not in url:
            # sometimes an image source can be relative
            # if it is provide the base url which also happens
            # to be the site variable atm.
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)
How do I make it so that each image I gathered from web scraping is then stored in a folder? I currently use Google Colab, since I am just practicing, and I want to store them in my Google Drive folder.
This is my code for web scraping:
import requests
from bs4 import BeautifulSoup
def getdata(url):
    r = requests.get(url)
    return r.text
htmldata = getdata('https://www.yahoo.com/')
soup = BeautifulSoup(htmldata, 'html.parser')
imgdata = []
for i in soup.find_all('img'):
    imgdata = i['src']
    print(imgdata)
I created a pics folder manually in the folder where the script is running, to store the pictures in. Then I changed your code in the for loop so it's appending the URLs to the imgdata list. The try/except block is there because not every URL in the list is valid.
import requests
from bs4 import BeautifulSoup
def getdata(url):
    r = requests.get(url)
    return r.text
htmldata = getdata('https://www.yahoo.com/')
soup = BeautifulSoup(htmldata, 'html.parser')
imgdata = []
for i in soup.find_all('img'):
    imgdata.append(i['src'])  # changed here so it's appending to the list
filename = "pics/picture{}.jpg"
for i in range(len(imgdata)):
    print(f"img {i+1} / {len(imgdata)}")
    # try block because not everything in the imgdata list is a valid url
    try:
        r = requests.get(imgdata[i], stream=True)
        with open(filename.format(i), "wb") as f:
            f.write(r.content)
    except:
        print("URL is not valid")
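Since you mentioned Google Colab and Google Drive: here is a minimal sketch of saving the images into a Drive folder instead, assuming the script runs inside Colab. The pics folder name is just an example, and the MyDrive path segment may appear as 'My Drive' on older Colab versions:
import os
import requests
from google.colab import drive  # only available inside Google Colab

drive.mount('/content/drive')                 # asks for authorization once
save_dir = '/content/drive/MyDrive/pics'      # example folder inside your Drive
os.makedirs(save_dir, exist_ok=True)          # create it if it does not exist yet

for i, src in enumerate(imgdata):             # imgdata is the list built above
    try:
        r = requests.get(src, stream=True)
        with open(os.path.join(save_dir, f'picture{i}.jpg'), 'wb') as f:
            f.write(r.content)
    except Exception as e:
        print(f'Skipping {src}: {e}')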
Currently I have a script that can only download the HTML of a given page.
Now I want to download all the files of the web page including HTML, CSS, JS and image files (same as we get with a ctrl-s of any website).
My current code is:
import urllib
url = "https://en.wikipedia.org/wiki/Python_%28programming_language%29"
urllib.urlretrieve(url, "t3.html")
I have looked at many questions, but they all only download the HTML.
The following implementation enables you to get the sub-HTML pages. It can be developed further to get the other files you need. I set the depth variable so you can specify the maximum number of sub-websites that you want to parse.
import urllib2
from BeautifulSoup import *
from urlparse import urljoin
def crawl(pages, depth=None):
    indexed_url = []  # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print "Could not open %s" % page
                    continue
                soup = BeautifulSoup(c.read())
                links = soup('a')  # finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url
pagelist=["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=2)
print urls
Python 3 version, 2019. May this save somebody some time:
#!/usr/bin/env python
import urllib.request as urllib2
from bs4 import *
from urllib.parse import urljoin
def crawl(pages, depth=None):
    indexed_url = []  # a list for the main and sub-HTML websites in the main website
    for i in range(depth):
        for page in pages:
            if page not in indexed_url:
                indexed_url.append(page)
                try:
                    c = urllib2.urlopen(page)
                except:
                    print("Could not open %s" % page)
                    continue
                soup = BeautifulSoup(c.read())
                links = soup('a')  # finding all the sub_links
                for link in links:
                    if 'href' in dict(link.attrs):
                        url = urljoin(page, link['href'])
                        if url.find("'") != -1:
                            continue
                        url = url.split('#')[0]
                        if url[0:4] == 'http':
                            indexed_url.append(url)
        pages = indexed_url
    return indexed_url
pagelist=["https://en.wikipedia.org/wiki/Python_%28programming_language%29"]
urls = crawl(pagelist, depth=1)
print( urls )
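If you also want the CSS, JS and image files of each crawled page (the "other files" mentioned above), here is a rough sketch of how this version could be extended; the save_assets name and the assets folder are just illustrative choices:
import os
import urllib.request
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

def save_assets(page_url, html, folder='assets'):
    # download every file referenced by img/script (src) and link (href) tags
    os.makedirs(folder, exist_ok=True)
    soup = BeautifulSoup(html, 'html.parser')
    for tag, attr in (('img', 'src'), ('script', 'src'), ('link', 'href')):
        for node in soup.find_all(tag):
            ref = node.get(attr)
            if not ref:
                continue
            asset_url = urljoin(page_url, ref)                # resolve relative references
            name = os.path.basename(urlparse(asset_url).path)
            if not name:                                      # e.g. a bare "/" path
                continue
            try:
                with urllib.request.urlopen(asset_url) as resp, \
                        open(os.path.join(folder, name), 'wb') as out:
                    out.write(resp.read())
            except Exception as exc:
                print("Could not fetch %s: %s" % (asset_url, exc))
You could call save_assets(page, html) for each page inside the crawl loop; since c.read() can only be consumed once, store the HTML in a variable first and pass it to both BeautifulSoup and save_assets.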
You can easily do that with the simple Python library pywebcopy.
For current version 5.0.1:
from pywebcopy import save_webpage
url = 'http://some-site.com/some-page.html'
download_folder = '/path/to/downloads/'
kwargs = {'bypass_robots': True, 'project_name': 'recognisable-name'}
save_webpage(url, download_folder, **kwargs)
You will have the HTML, CSS and JS all in your download_folder, working just like the original site.
Using Python 3+, Requests and other standard libraries.
The function savePage receives a requests.Response and the pagefilename where to save it. It saves pagefilename.html in the current folder and downloads the javascripts, css and images referenced by the script, link and img tags into a folder named pagefilename_files.
Any exceptions are printed to sys.stderr, and it returns a BeautifulSoup object.
The Requests session must be a global variable unless someone writes cleaner code here for us. You can adapt it to your needs.
import os, sys
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
def soupfindAllnSave(pagefolder, url, soup, tag2find='img', inner='src'):
    if not os.path.exists(pagefolder):  # create only once
        os.mkdir(pagefolder)
    for res in soup.findAll(tag2find):  # images, css, etc..
        try:
            filename = os.path.basename(res[inner])
            fileurl = urljoin(url, res.get(inner))
            # rename to saved file path
            # res[inner] # may or may not exist
            filepath = os.path.join(pagefolder, filename)
            res[inner] = os.path.join(os.path.basename(pagefolder), filename)
            if not os.path.isfile(filepath):  # was not downloaded
                with open(filepath, 'wb') as file:
                    filebin = session.get(fileurl)
                    file.write(filebin.content)
        except Exception as exc:
            print(exc, file=sys.stderr)
    return soup

def savePage(response, pagefilename='page'):
    url = response.url
    soup = BeautifulSoup(response.text)
    pagefolder = pagefilename + '_files'  # page contents
    soup = soupfindAllnSave(pagefolder, url, soup, 'img', inner='src')
    soup = soupfindAllnSave(pagefolder, url, soup, 'link', inner='href')
    soup = soupfindAllnSave(pagefolder, url, soup, 'script', inner='src')
    with open(pagefilename + '.html', 'w') as file:
        file.write(soup.prettify())
    return soup
Example saving the Google page and its contents (into a google_files folder):
session = requests.Session()
#... whatever requests config you need here
response = session.get('https://www.google.com')
savePage(response, 'google')
Try the Python library Scrapy. You can program Scrapy to recursively scan a website by downloading its pages, scanning them and following links:
An open source and collaborative framework for extracting the data you need from websites. In a fast, simple, yet extensible way.
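A minimal spider sketch (the class name, output folder and depth limit are only illustrative; run it with scrapy runspider spider.py -s DEPTH_LIMIT=1):
import os
import scrapy

class PageSpider(scrapy.Spider):
    name = 'pages'
    start_urls = ['https://en.wikipedia.org/wiki/Python_%28programming_language%29']

    def parse(self, response):
        # save the raw body of every visited page (naive file naming)
        os.makedirs('pages', exist_ok=True)
        filename = (response.url.split('/')[-1] or 'index') + '.html'
        with open(os.path.join('pages', filename), 'wb') as f:
            f.write(response.body)
        # follow every link found on the page; DEPTH_LIMIT bounds the recursion
        for href in response.css('a::attr(href)').getall():
            yield response.follow(href, callback=self.parse)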
If I request http://www.chictopia.com/photo/show/3
I get a proper image file.
However, if I set a range to crawl images from multiple web pages using a for loop and
f'http://www.chictopia.com/photo/show/+{x}'
I can't get the image files; it seems a 0-byte file is downloaded.
Why do I get 0-byte image files, and could anyone explain how to parse images from multiple pages?
Thank you
import re
import requests
from bs4 import BeautifulSoup
for x in range(3, 6):
    response = requests.get(f'http://www.chictopia.com/photo/show/+{x}')
    print(response)
    soup = BeautifulSoup(response.text, 'html.parser')
    img_tags = soup.find_all('img')
    urls = [img['src'] for img in img_tags]
    for url in urls:
        filename = re.search(r'/([\w_-]+[400]+[.](jpg))$', url)
        if not filename:
            print("fail".format(url))
            continue
        with open(filename.group(1), 'wb') as f:
            if 'http' not in url:
                url = '{}{}'.format(response, url)
            response = requests.get(url)
            f.write(response.content)
Try this. I modified the regex pattern and changed the code to request the proper image URL.
Now this code will save all the images whose link contains _400.jpg, with names like the following.
import re
import requests
from bs4 import BeautifulSoup
import shutil
for x in range(3, 6):
    response = requests.get(f'http://www.chictopia.com/photo/show/+{x}')
    # print(response.status_code)
    soup = BeautifulSoup(response.text, 'html.parser')
    img_tags = soup.find_all('img')
    urls = [img['src'] for img in img_tags]
    for url in urls:
        filename = re.findall(r'(.+_400\.jpg)', url)
        if len(filename) != 0:
            image = filename[0]
            image_name = f"image_{image.split('/')[-1]}"
            response = requests.get(image, stream=True)
            with open(image_name, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
            print(f'Saved : {image_name}')
For example, http://images2.chictopia.com/photos/mikajones/2162299642/2162299642_400.jpg is saved as image_2162299642_400.jpg.
Now, what was wrong with your code: you used the wrong regex and took a matched value that is not the complete URL of the image. That is why you were getting zero-byte files (you were not even requesting the image URL).
All fixed.
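To make the mapping concrete, this is what the new pattern produces for the example URL above:
import re

url = 'http://images2.chictopia.com/photos/mikajones/2162299642/2162299642_400.jpg'
match = re.findall(r'(.+_400\.jpg)', url)
print(match[0])                            # the full image URL, unchanged
print(f"image_{match[0].split('/')[-1]}")  # image_2162299642_400.jpg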
Hello Community, I have a problem and I don't know how to solve it. My problem is that I wrote a script to crawl webpages for images with BeautifulSoup4, but I get the error (AttributeError: 'NoneType' object has no attribute 'group').
import re
import requests
from bs4 import BeautifulSoup
site = 'https://www.fotocommunity.de/natur/wolken/3144?sort=new'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
img_tags = soup.find_all('img', {"src": True})
urls = [img["src"] for img in img_tags]
for url in urls:
    filename = re.search(r'([\w_-]+[.](jpg|png))$', url)
    with open(filename.group(1), 'wb') as f:
        if 'http' not in url:
            # sometimes an image source can be relative
            # if it is provide the base url which also happens
            # to be the site variable atm.
            url = '{}{}'.format(site, url)
        response = requests.get(url)
        f.write(response.content)
Your regex is wrong. If you're not comfortable with regexes, use Python's built-in urllib to do the heavy lifting instead of writing them.
Use something like this (untested):
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlsplit # import this additional library
from os.path import basename # import this additional library
site = 'https://www.fotocommunity.de/natur/wolken/3144?sort=new'
response = requests.get(site)
soup = BeautifulSoup(response.text, 'html.parser')
images_div = soup.find(id=re.compile(r"fcx-gallery-\w+"))  # focus on the div containing the images
if images_div:  # test if the gallery div was found
    img_tags = images_div.find_all('img', {"data-src": True})  # get all the images in that div
    urls = [img["data-src"] for img in img_tags]  # grab sources from data-src
    for url in urls:
        filename = basename(urlsplit(url).path)  # use this instead of a regex
        with open(filename, 'wb') as f:  # filename is now a string
            if 'http' not in url:
                # sometimes an image source can be relative
                # if it is provide the base url which also happens
                # to be the site variable atm.
                url = '{}{}'.format(site, url)
            response = requests.get(url)
            f.write(response.content)
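For illustration, with a hypothetical URL, this is what the urlsplit/basename combination extracts:
from urllib.parse import urlsplit
from os.path import basename

url = 'https://img.fotocommunity.com/some-photo-123abc.jpg?width=400'  # made-up example
print(urlsplit(url).path)            # /some-photo-123abc.jpg  (query string stripped)
print(basename(urlsplit(url).path))  # some-photo-123abc.jpg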
I am building a crawler in Python and I have the list of hrefs from the page.
Now I have the list of file extensions to download, like
list = ['zip','rar','pdf','mp3']
How can I save the files from those URLs to a local directory using Python?
EDIT:
import urllib2
from bs4 import BeautifulSoup
url = "http://www.example.com/downlaod"
site = urllib2.urlopen(url)
html = site.read()
soup = BeautifulSoup(html)
list_urls = soup.find_all('a')
print list_urls[6]
Going by your posted example:
import urllib2
from bs4 import BeautifulSoup
url = "http://www.example.com/downlaod"
site = urllib2.urlopen(url)
html = site.read()
soup = BeautifulSoup(html)
list_urls = soup.find_all('a')
print list_urls[6]
So, the URL you want to fetch next is presumably list_urls[6]['href'].
The first trick is that this might be a relative URL rather than absolute. So:
newurl = list_urls[6]['href']
absurl = urlparse.urljoin(site.url, newurl)
Also, you want to only fetch the file if it has the right extension, so:
if not absurl.endswith(extensions):
    return # or break or whatever
But once you've decided what URL you want to download, it's no harder than your initial fetch:
page = urllib2.urlopen(absurl)
html = page.read()
path = urlparse.urlparse(absurl).path
name = os.path.basename(path)
with open(name, 'wb') as f:
    f.write(html)
That's mostly it.
There are a few things you might want to add, but if so, you have to add them all manually. For example:
Look for a Content-disposition header with a suggested filename to use in place of the URL's basename.
Use shutil.copyfileobj to copy from page to f instead of reading the whole thing into memory and then writing it out.
Deal with existing files with the same name.
…
But that's the basics.
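If you are on Python 3, here is a hedged sketch of the same steps (collect the links, keep only the wanted extensions, save each file by its basename) using requests and BeautifulSoup; the URL is the example from the question and the extension tuple mirrors the question's list:
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

base_url = 'http://www.example.com/downlaod'   # the question's example URL
extensions = ('.zip', '.rar', '.pdf', '.mp3')  # note the leading dots for endswith

html = requests.get(base_url).text
soup = BeautifulSoup(html, 'html.parser')

for a in soup.find_all('a', href=True):
    absurl = urljoin(base_url, a['href'])      # resolve relative links
    if not absurl.lower().endswith(extensions):
        continue
    name = os.path.basename(urlparse(absurl).path)
    with open(name, 'wb') as f:
        f.write(requests.get(absurl).content)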
You can use the Python requests library, as you asked in the question: http://www.python-requests.org
You can save a file from a URL like this:
import requests
url='http://i.stack.imgur.com/0LJdh.jpg'
data = requests.get(url).content
filename = "image.jpg"
with open(filename, 'wb') as f:
    f.write(data)
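For large files, a common variant with the same library streams the download in chunks instead of loading it all into memory:
import requests

url = 'http://i.stack.imgur.com/0LJdh.jpg'
with requests.get(url, stream=True) as r:
    r.raise_for_status()
    with open('image.jpg', 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            f.write(chunk)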
Solution using urllib3:
import os
import urllib3
from bs4 import BeautifulSoup
import urllib.parse
url = "https://path/site"
site = urllib3.PoolManager()
html = site.request('GET', url)
soup = BeautifulSoup(html.data, "lxml")
list_urls = soup.find_all('a')
And then a recursive function to get all the files:
def recursive_function(list_urls):
    if not list_urls:                    # nothing left to process
        return
    newurl = list_urls[0]['href']
    absurl = url + newurl
    list_urls.pop(0)
    if absurl.endswith(extensions):      # verify it has one of the targeted extensions
        response = site.request('GET', absurl)  # reuse the PoolManager created above
        name = os.path.basename(absurl)
        with open(name, 'wb') as f:
            f.write(response.data)
    return recursive_function(list_urls)
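Usage could look like this, assuming extensions is defined first as a tuple of suffixes accepted by endswith (the values are just examples):
extensions = ('.zip', '.rar', '.pdf', '.mp3')
recursive_function(list_urls)  # consumes list_urls and saves every matching file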