Image src is not completely scraped.(../../../images/stat/popolazione.jpg) - python

I am using the following code and I am unable to get complete src of image.
from requests import get
from bs4 import BeautifulSoup
import re
def statistics(url):
resp = get(url)
soup = BeautifulSoup(resp.text, "lxml")
reqTable = soup.find('table', class_ = 'tabwrap')
images = reqTable.findAll('img')
for img in images:
print(img['src'])
statistics('http://www.comuni-italiani.it/084/001/statistiche/')
output:
This is image src i want to scrape:
Link to website: http://www.comuni-italiani.it/084/001/statistiche/
I want to get this Popolazione number out of the image src so that's why I need that src so i can split and get that number 59.605

Try the following to get the numbers available in those image links:
import requests
from bs4 import BeautifulSoup
def get_image_links(url):
resp = requests.get(url)
soup = BeautifulSoup(resp.text, "lxml")
for img in soup.select('table.tabwrap img[src*="d_fnote_title"]'):
item_num = img['src'].split("h|")[1].split("|")[0]
print(item_num)
if __name__ == '__main__':
link = 'http://www.comuni-italiani.it/084/001/statistiche/'
get_image_links(link)

Related

How to open all href within a div class?

I'm new to python and everything, and I'm looking to parse all the href within a div class. My goal is to create a program to open up all the links within the div class to be able to save the photos associated with the href.
The link: https://www.opi.com/shop-products/nail-polish-powders/nail-lacquer
The section I want to parse is "div-id: all_nail_lacquer"
So far I'm able to get all the href and this is what I have so far:
import urllib
import urllib.request
from bs4 import BeautifulSoup
theurl = "https://www.opi.com/shop-products/nail-polish-powders/nail-lacquer"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
print(soup.title.text)
nail_lacquer = (soup.find('div', {"id":"all_nail_lacquer"}))
"""
for nail_lacquer in soup.find_all('div'):
print(nail_lacquer.findAll('a')
"""
for a in soup.findAll('div', {"id":"all_nail_lacquer"}):
for b in a.findAll('a'):
print(b.get('href'))
To print image links (even hi-res image) and titles, you can use this script:
import urllib
import urllib.request
from bs4 import BeautifulSoup
theurl = "https://www.opi.com/shop-products/nail-polish-powders/nail-lacquer"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
for img in soup.select('#all_nail_lacquer [typeof="foaf:Image"][data-src]'):
print(img['data-src'])
print(img['data-src'].replace('shelf_image', 'photos')) # <-- this is URL to hi-res image
print(img['title'])
print('-' * 80)
Prints:
https://www.opi.com/sites/default/files/styles/product_shelf_image/public/baby-take-a-vow-nlsh1-nail-lacquer-22850011001_0_0.jpg?itok=3b2ftHzc
https://www.opi.com/sites/default/files/styles/product_photos/public/baby-take-a-vow-nlsh1-nail-lacquer-22850011001_0_0.jpg?itok=3b2ftHzc
Baby, Take a Vow
--------------------------------------------------------------------------------
https://www.opi.com/sites/default/files/styles/product_shelf_image/public/suzi-without-a-paddle-nlf88-nail-lacquer-22006698188_21_0.jpg?itok=mgi1-rz3
https://www.opi.com/sites/default/files/styles/product_photos/public/suzi-without-a-paddle-nlf88-nail-lacquer-22006698188_21_0.jpg?itok=mgi1-rz3
Suzi Without a Paddle
--------------------------------------------------------------------------------
https://www.opi.com/sites/default/files/styles/product_shelf_image/public/coconuts-over-opi-nlf89-nail-lacquer-22006698189_24_1_0.jpg?itok=yasOZA4l
https://www.opi.com/sites/default/files/styles/product_photos/public/coconuts-over-opi-nlf89-nail-lacquer-22006698189_24_1_0.jpg?itok=yasOZA4l
Coconuts Over OPI
--------------------------------------------------------------------------------
https://www.opi.com/sites/default/files/styles/product_shelf_image/public/no-tan-lines-nlf90-nail-lacquer-22006698190_20_1_0.jpg?itok=ot_cu8c5
https://www.opi.com/sites/default/files/styles/product_photos/public/no-tan-lines-nlf90-nail-lacquer-22006698190_20_1_0.jpg?itok=ot_cu8c5
No Tan Lines
--------------------------------------------------------------------------------
...and so on.
EDIT: To save images to disk, you can use this script:
import requests
from bs4 import BeautifulSoup
theurl = "https://www.opi.com/shop-products/nail-polish-powders/nail-lacquer"
thepage = requests.get(theurl)
soup = BeautifulSoup(thepage.content, "html.parser")
i = 1
for img in soup.select('#all_nail_lacquer [typeof="foaf:Image"][data-src]'):
u = img['data-src'].replace('shelf_image', 'photos')
with open('img_{:04d}.jpg'.format(i), 'wb') as f_out:
print('Saving {}'.format(u))
f_out.write(requests.get(u).content)
i += 1

How to get HTML of the whole page without scrolling?

import requests
import urllib.request
from bs4 import BeautifulSoup
def get_photos(nick,how_many):
url = f"https://www.picuki.com/profile/{nick}"
content = requests.get(url,headers={'User-Agent': 'Mozilla/5.0'}).content
soup = BeautifulSoup(content,"html.parser")
images = [f["src"] for f in soup.findAll('img',class_="post-image")]
for index, image in enumerate(images, start=1):
urllib.request.urlretrieve(image, f"/Users/user/PycharmProjects/untitled1/Instagram_images/image{index}.png")
if index == how_many: break
if __name__ == "__main__":
get_photos("Username",20)
So I have this code which downloads images in png format from instagram. But problem is that this page only loads 18 images without scrolling. So if I input 18-36 I need to scroll down page one more time, if 36-54 I need to scroll down 2 times and get it's HTML. How to do it with request and is it even possible with this module?
The images are loaded with Ajax, but you can emulate the Ajax with requests module.
This script will print all image URLs found on user profile:
import requests
from bs4 import BeautifulSoup
username = 'itsdougthepug'
base_url = 'https://www.picuki.com/profile/{username}'
def get_image_urls(username):
url = base_url.format(username=username)
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
while True:
for f in soup.findAll('img',class_="post-image"):
yield f['src']
load_more_url = soup.select_one('.load-more-wrapper[data-next]')
if not load_more_url:
load_more_url = soup.select_one('.pagination-next-page-input[value]')
if load_more_url:
load_more_url = load_more_url['value']
else:
load_more_url = load_more_url['data-next']
if not load_more_url:
break
soup = BeautifulSoup(requests.get('https://www.picuki.com' + load_more_url).content, 'html.parser')
for img in get_image_urls(username):
print(img)
Prints:
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103328423_965950027183296_957866876806120724_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=100&_nc_ohc=sW8Ic2lI-4UAX_b7bkB&oh=dc42f3f625065b6fba524bd39fc29cb0&oe=5EE7819B
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103183716_3364797436946158_1962633742202963007_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=1&_nc_ohc=OjegUcacb2kAX_BGNBA&oh=92a8035ffed07e724a77617c6ff73b73&oe=5F0F1F22
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/s640x640/102951446_2650089068539996_1395066409287738000_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=1&_nc_ohc=zXDXxxtqYUkAX9_1jE3&oh=06e83257c7a2b1cfea593719a3af60d2&oe=5F0D3F32
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103290695_2721943028038123_664290938707092396_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=107&_nc_ohc=cZKGnM3wjBwAX9wsGvR&oh=132218410341a0ffc2d7d78f38904a01&oe=5F104353
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103207650_283928112789317_1081832932435688252_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=105&_nc_ohc=3XfsL50CwCoAX9k2_dN&oh=969bdf74e73466a39952957bfd8ec528&oe=5F0E2A91
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/s640x640/102546510_111827600395599_8198630171951588410_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=103&_nc_ohc=cVJqLrxo-fUAX9fBZtG&oh=8edcc8a5bf56519d0155e6d23ac514b3&oe=5F0EA104
... and so on.

Python going to new page by chagning url number

hi i need to make this python code loop page from 1 until 5
('https://test.com/index/1')
if their any way with selenium that will be cool <3
am new it python
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import requests
import json
import csv
import time
html = urlopen('https://test.com/index/1')
bs = BeautifulSoup(html, 'html.parser')
images = bs.find_all('img', {'src':re.compile('.jpg')})
for image in images:
print(image['src'].split('/')[-1].split('_')[0])
You can loop on range of numbers add use them as parameter
for i in range(1, 6):
html = urlopen(f'https://test.com/index/{i}')
bs = BeautifulSoup(html, 'html.parser')
images = bs.find_all('img', {'src': re.compile('.jpg')})
for image in images:
print(image['src'].split('/')[-1].split('_')[0])
It is mostly same as #Guy, but you can do it this way too.
for num in range(1, 6):
html = urlopen(f'https://test.com/index/%s'%num)
bs = BeautifulSoup(html, 'html.parser')
images = bs.find_all('img', {'src': re.compile('.jpg')})
for image in images:
print(image['src'].split('/')[-1].split('_')[0])

How to detect a strong tag and add a "*" to each?

I have this code in python, and what it does to me is to stretch from a web. The text content of the articles of the web, and save them in different files. I would like to know, how to detect a strong tag and in each one of them add a " " before or after.
This is the result that I need:
import urllib2
import re
from bs4 import BeautifulSoup
import time
def _remove_attrs(soup):
for tag in soup.findAll(True):
href=''
if (tag.has_attr('href')):
href=tag.get('href')
src=''
if (tag.has_attr('src')):
src=tag.get('src')
# tag.attrs = None
tag.attrs = {}
if (href!=''):
tag['href']= href
if (src!=''):
tag['src']= src
return soup
def _remove_empty(soup):
return soup
for x in soup.find_all():
if len(x.text) == 0:
x.extract()
return soup
base_url= 'http://www.scavonehnos.com.py/index.php?
mact=Vmcs,cntnt01,print,0&cntnt01articleid='
for x in range(10,12):
n_url=base_url + str(x)
print ("#PAGINA: "+n_url)
page = urllib2.urlopen(n_url)
soup = BeautifulSoup(page, 'html.parser')
contenido=(soup.div.get_text())
file = open('vicentec/prod_'+str(x)+'.txt', 'w')
file.write(u' '.strip(contenido).join((contenido)).encode('utf-
8'))
file.close()
time.sleep(5)
As you will see I want to add the asterisk to the <strong> tag on the web.
For those who visited this question this case I already solved it and it stayed and it works perfectly
import urllib2
import re
from bs4 import BeautifulSoup
import time
def _remove_attrs(soup):
for tag in soup.findAll(True):
href=''
if (tag.has_attr('href')):
href=tag.get('href')
src=''
if (tag.has_attr('src')):
src=tag.get('src')
# tag.attrs = None
tag.attrs = {}
if (href!=''):
tag['href']= href
if (src!=''):
tag['src']= src
return soup
def _remove_empty(soup):
return soup
for x in soup.find_all(''):
if len(x.text) == 0:
x.extract()
return soup
base_url= 'http://www.scavonehnos.com.py/index.php?mact=Vmcs,cntnt01,print,0&cntnt01articleid='
for x in range(10,225):
n_url=base_url + str(x)
print ("#PAGINA: "+n_url)
page = urllib2.urlopen(n_url)
soup = BeautifulSoup(page, 'html.parser')
for strong in soup.select('strong'):
strong.replace_with('#'+strong.get_text())
contenido=(soup.div.get_text())
fprod = 'vicentec/prod_'+(str(x))+'.txt'
file = open(fprod, "w")
file.write(u' '.strip(contenido).join((contenido)).encode('utf-8'))
file.close()

Having difficulties with my python script

I am new to Python and can use a little help. I am trying to write a script that will go out to a specific web site and download multiple .gif images in different spots at that site. Can anyone assist me in the right direction to take. This is the first one I have tried to make.
Here is what i got so far.
from http:// import http://folkworm.ceri.memphis.edu/heli/heli_bb_ag/ as bs
import urlparse
from urllib2 import urlopen
from urllib import urlretrieve
import os
import sys
def main(url, out_folder="C:\Users\jerry\Desktop\Heli/"):
"""Downloads all the images at 'url' to /test/"""
http://folkworm.ceri.memphis.edu/heli/heli_bb_ag/ = bs(urlopen(url))
parsed = list(urlparse.urlparse(url))
for image in http://folkworm.ceri.memphis.edu/heli/heli_bb_ag/.findAll("gif"):
print "gif: %(src)s" % image
filename = gif["src"].split("/")[-1]
parsed[2] = gif["src"]
outpath = os.path.join(out_folder, filename)
if gif["src"].lower().startswith("http"):
urlretrieve(gif["src"], outpath)
else:
urlretrieve(urlparse.urlunparse(parsed), outpath)
def _usage():
print "usage: python dumpimages.py http:folkworm.ceri.memphis.edu/heli/heli_bb_ag/ [outpath]"
if __name__ == "__main__":
url = sys.argv[-1]
out_folder = "/test/"
if not url.lower().startswith("http"):
out_folder = sys.argv[-1]
url = sys.argv[-2]
if not url.lower().startswith("http"):
_usage()
sys.exit(-1)
main(url, out_folder)
Here is the basic idea.
>>> import requests
>>> from bs4 import BeautifulSoup
>>> item = requests.get('http://folkworm.ceri.memphis.edu/heli/heli_bb_ag/')
>>> page = item.text
>>> soup = BeautifulSoup(page, 'lxml')
>>> links = soup.findAll('a')
>>> for link in links:
... if '.gif' in link.attrs['href']:
... print (link.attrs['href'])
... break
...
CCAR_HHZ_AG_00.2017012700.gif?v=1485534942
The break statement is there just to interrupt the script so that it doesn't print all of the names of the gif's. The next step would be to add code to that loop to concatenate the URL mentioned in the requests.get to the name of each gif and do a requests.get for it. This time though you would do, say, image = item.content to get the image in bytes, which you could write to a file of your choice.
EDIT: Fleshed out. Note you still need to arrange to provide one file name for each output file.
>>> import requests
>>> from bs4 import BeautifulSoup
>>> URL = 'http://folkworm.ceri.memphis.edu/heli/heli_bb_ag/'
>>> item = requests.get(URL)
>>> page = item.text
>>> soup = BeautifulSoup(page, 'lxml')
>>> links = soup.findAll('a')
>>> for link in links:
... if '.gif' in link.attrs['href']:
... print (link.attrs['href'])
... pic = requests.get(URL + link.attrs['href'])
... image = pic.content
... open('pic.gif', 'wb').write(image)
... break
...
CCAR_HHZ_AG_00.2017012700.gif?v=1485535857
100846

Categories

Resources