How to open all href within a div class? - python

I'm new to python and everything, and I'm looking to parse all the href within a div class. My goal is to create a program to open up all the links within the div class to be able to save the photos associated with the href.
The link: https://www.opi.com/shop-products/nail-polish-powders/nail-lacquer
The section I want to parse is "div-id: all_nail_lacquer"
So far I'm able to get all the href and this is what I have so far:
import urllib
import urllib.request
from bs4 import BeautifulSoup

theurl = "https://www.opi.com/shop-products/nail-polish-powders/nail-lacquer"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
print(soup.title.text)

# The single div the question is about, looked up by its id.
nail_lacquer = (soup.find('div', {"id":"all_nail_lacquer"}))
"""
for nail_lacquer in soup.find_all('div'):
print(nail_lacquer.findAll('a')
"""
# Print the href of every anchor found inside the target div.
for a in soup.findAll('div', {"id":"all_nail_lacquer"}):
    for b in a.findAll('a'):
        print(b.get('href'))

To print image links (even hi-res image) and titles, you can use this script:
import urllib
import urllib.request
from bs4 import BeautifulSoup

theurl = "https://www.opi.com/shop-products/nail-polish-powders/nail-lacquer"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")

# Select elements inside #all_nail_lacquer that are typed as foaf:Image and
# carry a data-src attribute, then print both URL variants and the title.
for img in soup.select('#all_nail_lacquer [typeof="foaf:Image"][data-src]'):
    print(img['data-src'])
    print(img['data-src'].replace('shelf_image', 'photos')) # <-- this is URL to hi-res image
    print(img['title'])
    print('-' * 80)
Prints:
https://www.opi.com/sites/default/files/styles/product_shelf_image/public/baby-take-a-vow-nlsh1-nail-lacquer-22850011001_0_0.jpg?itok=3b2ftHzc
https://www.opi.com/sites/default/files/styles/product_photos/public/baby-take-a-vow-nlsh1-nail-lacquer-22850011001_0_0.jpg?itok=3b2ftHzc
Baby, Take a Vow
--------------------------------------------------------------------------------
https://www.opi.com/sites/default/files/styles/product_shelf_image/public/suzi-without-a-paddle-nlf88-nail-lacquer-22006698188_21_0.jpg?itok=mgi1-rz3
https://www.opi.com/sites/default/files/styles/product_photos/public/suzi-without-a-paddle-nlf88-nail-lacquer-22006698188_21_0.jpg?itok=mgi1-rz3
Suzi Without a Paddle
--------------------------------------------------------------------------------
https://www.opi.com/sites/default/files/styles/product_shelf_image/public/coconuts-over-opi-nlf89-nail-lacquer-22006698189_24_1_0.jpg?itok=yasOZA4l
https://www.opi.com/sites/default/files/styles/product_photos/public/coconuts-over-opi-nlf89-nail-lacquer-22006698189_24_1_0.jpg?itok=yasOZA4l
Coconuts Over OPI
--------------------------------------------------------------------------------
https://www.opi.com/sites/default/files/styles/product_shelf_image/public/no-tan-lines-nlf90-nail-lacquer-22006698190_20_1_0.jpg?itok=ot_cu8c5
https://www.opi.com/sites/default/files/styles/product_photos/public/no-tan-lines-nlf90-nail-lacquer-22006698190_20_1_0.jpg?itok=ot_cu8c5
No Tan Lines
--------------------------------------------------------------------------------
...and so on.
EDIT: To save images to disk, you can use this script:
import requests
from bs4 import BeautifulSoup

theurl = "https://www.opi.com/shop-products/nail-polish-powders/nail-lacquer"
thepage = requests.get(theurl)
soup = BeautifulSoup(thepage.content, "html.parser")

# enumerate() numbers the output files img_0001.jpg, img_0002.jpg, ...
# in the order the images appear in the page.
images = soup.select('#all_nail_lacquer [typeof="foaf:Image"][data-src]')
for i, img in enumerate(images, start=1):
    # Swap the URL segment to request the hi-res variant of the image.
    u = img['data-src'].replace('shelf_image', 'photos')
    with open('img_{:04d}.jpg'.format(i), 'wb') as f_out:
        print('Saving {}'.format(u))
        f_out.write(requests.get(u).content)

Related

How to get HTML of the whole page without scrolling?

import requests
import urllib.request
from bs4 import BeautifulSoup
def get_photos(nick,how_many):
    """Download post images from a picuki profile page.

    nick: profile name inserted into the profile URL.
    how_many: stop once this many images have been downloaded.

    Only the <img class="post-image"> elements present in the HTML of the
    single profile request are considered; no further pages are fetched.
    """
    url = f"https://www.picuki.com/profile/{nick}"
    content = requests.get(url,headers={'User-Agent': 'Mozilla/5.0'}).content
    soup = BeautifulSoup(content,"html.parser")
    images = [f["src"] for f in soup.findAll('img',class_="post-image")]
    # Files are numbered from 1 so the counter can be compared to how_many.
    for index, image in enumerate(images, start=1):
        urllib.request.urlretrieve(image, f"/Users/user/PycharmProjects/untitled1/Instagram_images/image{index}.png")
        if index == how_many:
            break


if __name__ == "__main__":
    get_photos("Username",20)
So I have this code, which downloads images in PNG format from Instagram. The problem is that the page only loads 18 images without scrolling. So if I ask for 18-36 images I need to scroll down the page one more time, and for 36-54 I need to scroll down twice, and then get its HTML. How can I do this with requests, and is it even possible with this module?
The images are loaded with Ajax, but you can emulate the Ajax with requests module.
This script will print all image URLs found on user profile:
import requests
from bs4 import BeautifulSoup

username = 'itsdougthepug'
base_url = 'https://www.picuki.com/profile/{username}'


def get_image_urls(username):
    """Yield the src of every post image on a picuki profile, page by page.

    After the images of the current page are yielded, the next page URL is
    read from the page itself and fetched; iteration ends when the page
    offers no further URL.
    """
    url = base_url.format(username=username)
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    while True:
        for f in soup.findAll('img',class_="post-image"):
            yield f['src']

        # The next-page path is taken from the data-next attribute of
        # .load-more-wrapper when present, otherwise from the value of
        # .pagination-next-page-input.
        load_more_url = soup.select_one('.load-more-wrapper[data-next]')
        if not load_more_url:
            load_more_url = soup.select_one('.pagination-next-page-input[value]')
            if load_more_url:
                load_more_url = load_more_url['value']
        else:
            load_more_url = load_more_url['data-next']

        # Neither element found (or an empty value): nothing more to load.
        if not load_more_url:
            break
        soup = BeautifulSoup(requests.get('https://www.picuki.com' + load_more_url).content, 'html.parser')


for img in get_image_urls(username):
    print(img)
Prints:
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103328423_965950027183296_957866876806120724_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=100&_nc_ohc=sW8Ic2lI-4UAX_b7bkB&oh=dc42f3f625065b6fba524bd39fc29cb0&oe=5EE7819B
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103183716_3364797436946158_1962633742202963007_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=1&_nc_ohc=OjegUcacb2kAX_BGNBA&oh=92a8035ffed07e724a77617c6ff73b73&oe=5F0F1F22
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/s640x640/102951446_2650089068539996_1395066409287738000_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=1&_nc_ohc=zXDXxxtqYUkAX9_1jE3&oh=06e83257c7a2b1cfea593719a3af60d2&oe=5F0D3F32
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103290695_2721943028038123_664290938707092396_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=107&_nc_ohc=cZKGnM3wjBwAX9wsGvR&oh=132218410341a0ffc2d7d78f38904a01&oe=5F104353
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103207650_283928112789317_1081832932435688252_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=105&_nc_ohc=3XfsL50CwCoAX9k2_dN&oh=969bdf74e73466a39952957bfd8ec528&oe=5F0E2A91
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/s640x640/102546510_111827600395599_8198630171951588410_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=103&_nc_ohc=cVJqLrxo-fUAX9fBZtG&oh=8edcc8a5bf56519d0155e6d23ac514b3&oe=5F0EA104
... and so on.

Image src is not completely scraped.(../../../images/stat/popolazione.jpg)

I am using the following code and I am unable to get complete src of image.
from requests import get
from bs4 import BeautifulSoup
import re
def statistics(url):
    """Print the src attribute of every image inside the 'tabwrap' table."""
    resp = get(url)
    soup = BeautifulSoup(resp.text, "lxml")
    reqTable = soup.find('table', class_ = 'tabwrap')
    images = reqTable.findAll('img')
    for img in images:
        print(img['src'])


statistics('http://www.comuni-italiani.it/084/001/statistiche/')
output:
This is image src i want to scrape:
Link to website: http://www.comuni-italiani.it/084/001/statistiche/
I want to get the Popolazione number out of the image src; that's why I need the full src, so I can split it and get the number 59.605.
Try the following to get the numbers available in those image links:
import requests
from bs4 import BeautifulSoup
def get_image_links(url):
    """Print the number embedded in each matching image URL on the page.

    Only images inside table.tabwrap whose src contains "d_fnote_title"
    are considered.
    """
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "lxml")
    for img in soup.select('table.tabwrap img[src*="d_fnote_title"]'):
        # The value sits between "h|" and the next "|" in the src string.
        item_num = img['src'].split("h|")[1].split("|")[0]
        print(item_num)


if __name__ == '__main__':
    link = 'http://www.comuni-italiani.it/084/001/statistiche/'
    get_image_links(link)

Python going to new page by changing url number

Hi, I need to make this Python code loop over the pages from 1 to 5
('https://test.com/index/1')
If there is any way to do it with Selenium, that would be cool <3
I am new to Python.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import requests
import json
import csv
import time
html = urlopen('https://test.com/index/1')
bs = BeautifulSoup(html, 'html.parser')
# The dot is escaped so that only a literal ".jpg" in the src matches.
images = bs.find_all('img', {'src': re.compile(r'\.jpg')})
for image in images:
    # Last path segment of the src, cut at the first underscore.
    print(image['src'].split('/')[-1].split('_')[0])
You can loop over a range of numbers and use them as a parameter:
# Compiled once, outside the loop; the dot is escaped so that only a
# literal ".jpg" in the src matches.
jpg_src = re.compile(r'\.jpg')

# range(1, 6) visits pages 1 through 5.
for i in range(1, 6):
    html = urlopen(f'https://test.com/index/{i}')
    bs = BeautifulSoup(html, 'html.parser')
    images = bs.find_all('img', {'src': jpg_src})
    for image in images:
        # Last path segment of the src, cut at the first underscore.
        print(image['src'].split('/')[-1].split('_')[0])
It is mostly the same as @Guy's answer, but you can do it this way too.
# range(1, 6) visits pages 1 through 5.
for num in range(1, 6):
    # Plain %-formatting; an f-string prefix is not needed here.
    html = urlopen('https://test.com/index/%s' % num)
    bs = BeautifulSoup(html, 'html.parser')
    # The dot is escaped so that only a literal ".jpg" in the src matches.
    images = bs.find_all('img', {'src': re.compile(r'\.jpg')})
    for image in images:
        # Last path segment of the src, cut at the first underscore.
        print(image['src'].split('/')[-1].split('_')[0])

How to Get Text between Span Tags XPATH Python

I am working with this website https://www.pealim.com/dict/?page=1. So I basically want to get the hebrew word and its pronunciation.
Below is my code and it loops through all the td tags however, it produces the exact same output which is the following {'latin': 'av', 'hebrew': u'\u05d0\u05b8\u05d1'} And this code is for only page=1. I would love to know if there is any automated way to loop through every page.
import requests
from lxml import etree
resp = requests.get("https://www.pealim.com/dict/?page=1")
htmlparser = etree.HTMLParser()
tree = etree.fromstring(resp.text, htmlparser)
for td in tree.xpath('//*//table[@class="table table-hover dict-table-t"]/tbody/tr'):
    print(td)
    # NOTE: a leading '//' is evaluated from the document root, not from
    # `td`, so both expressions return the first match on the whole page
    # for every row -- this is the repeated output the question describes.
    data = {
        'hebrew': td.xpath('string(//span[@class="menukad"])'),
        'latin': td.xpath('string(//span[@class="dict-transcription"])'),
    }
    print(data)
I would like to collect information for every single entry in that website. Please let me know what I can do to achieve this.
import requests
from bs4 import BeautifulSoup
from pprint import pprint
# Visit dictionary pages 1 through 410 and print the word pairs of each.
for i in range(1, 411):
    resp = requests.get("https://www.pealim.com/dict/?page={}".format(i))
    soup = BeautifulSoup(resp.text, 'lxml')
    # Pair the n-th .menukad element with the n-th .dict-transcription one.
    data = [
        (m.text, t.text)
        for m, t in zip(soup.select('.menukad'), soup.select('.dict-transcription'))
    ]
    print('PAGE {}'.format(i))
    print('*' * 80)
    pprint(data)
Prints:
PAGE 1
********************************************************************************
[('אָב', 'av'),
('אַבָּא', 'aba'),
('אָבִיב', 'aviv'),
('אֵב', 'ev'),
('לֶאֱבוֹד', "le'evod"),
('לְהֵיאָבֵד', "lehe'aved"),
('לְאַבֵּד', "le'abed"),
('לְהִתְאַבֵּד', "lehit'abed"),
('לְהַאֲבִיד', "leha'avid"),
('הִתְאַבְּדוּת', "hit'abdut"),
('אִיבּוּד', 'ibud'),
('אֲבֵדָה', 'aveda'),
('אָבוּד', 'avud'),
('לְאַבְחֵן', "le'avchen"),
('אִיבְחוּן', 'ivchun')]
PAGE 2
********************************************************************************
[('לְאַבְטֵחַ', "le'avteach"),
('אִיבְטוּחַ', 'ivtuach'),
('אֲבַטִּיחַ', 'avatiach'),
('לֶאֱבוֹת', "le'evot"),
('אֵבֶל', 'evel'),
('לֶאֱבוֹל', "le'evol"),
('אֲבָל', 'aval'),
('לְהִתְאַבֵּל', "lehit'abel"),
('לְהִתְאַבֵּן', "lehit'aben"),
('אֶבֶן', 'even'),
('לְהַאֲבִיס', "leha'avis"),
('לְהֵיאָבֵק', "lehe'avek"),
('מַאֲבָק', "ma'avak"),
('לְאַבֵּק', "le'abek"),
('אָבָק', 'avak')]
PAGE 3
********************************************************************************
[('לְהִתְאַבֵּק', "lehit'abek"),
('לְהִתְאַבֵּק', "lehit'abek"),
('מְאוּבָּק', "me'ubak"),
('אִיבּוּק', 'ibuk'),
...and so on.
Andrej beat me to it, but alternatively you can use .find() and .get_text() methods of BeautifulSoup:
import bs4
import requests
# Visit dictionary pages 1 through 410.
for page_number in range(1, 411):
    print("-" * 35, page_number, "-" * 35)
    resp = requests.get("https://www.pealim.com/dict/?page={}".format(page_number))
    soup = bs4.BeautifulSoup(resp.text, "html.parser")
    # First <tbody> in the page; each of its rows is one dictionary entry.
    table_elem = soup.find("tbody")
    rows = table_elem.find_all("tr")
    for row in rows:
        hebrew = row.find("span", class_="menukad").get_text()
        latin = row.find("span", class_="dict-transcription").get_text()
        print("{}: {}".format(hebrew, latin))
To yield essentially the same result.

How to detect a strong tag and add a "*" to each?

I have this code in Python; it extracts the text content of the articles from a website and saves them in different files. I would like to know how to detect a strong tag and add a "*" before or after each one of them.
This is the result that I need:
import urllib2
import re
from bs4 import BeautifulSoup
import time
def _remove_attrs(soup):
    """Strip every attribute except non-empty href and src from all tags."""
    for tag in soup.findAll(True):
        href=''
        if (tag.has_attr('href')):
            href=tag.get('href')
        src=''
        if (tag.has_attr('src')):
            src=tag.get('src')
        # tag.attrs = None
        tag.attrs = {}
        if (href!=''):
            tag['href']= href
        if (src!=''):
            tag['src']= src
    return soup


def _remove_empty(soup):
    """Return soup unchanged; the pruning loop below is never reached."""
    return soup
    for x in soup.find_all():
        if len(x.text) == 0:
            x.extract()
    return soup


base_url= 'http://www.scavonehnos.com.py/index.php?mact=Vmcs,cntnt01,print,0&cntnt01articleid='
for x in range(10,12):
    n_url=base_url + str(x)
    print ("#PAGINA: "+n_url)
    page = urllib2.urlopen(n_url)
    soup = BeautifulSoup(page, 'html.parser')
    # Text of the first <div> in the page, written to one file per article.
    contenido=(soup.div.get_text())
    file = open('vicentec/prod_'+str(x)+'.txt', 'w')
    file.write(u' '.strip(contenido).join((contenido)).encode('utf-8'))
    file.close()
    time.sleep(5)
As you will see I want to add the asterisk to the <strong> tag on the web.
For those who visit this question: I have already solved this case, and the following version works perfectly:
import urllib2
import re
from bs4 import BeautifulSoup
import time
def _remove_attrs(soup):
for tag in soup.findAll(True):
href=''
if (tag.has_attr('href')):
href=tag.get('href')
src=''
if (tag.has_attr('src')):
src=tag.get('src')
# tag.attrs = None
tag.attrs = {}
if (href!=''):
tag['href']= href
if (src!=''):
tag['src']= src
return soup
def _remove_empty(soup):
return soup
for x in soup.find_all(''):
if len(x.text) == 0:
x.extract()
return soup
base_url= 'http://www.scavonehnos.com.py/index.php?mact=Vmcs,cntnt01,print,0&cntnt01articleid='
for x in range(10,225):
    n_url=base_url + str(x)
    print ("#PAGINA: "+n_url)
    page = urllib2.urlopen(n_url)
    soup = BeautifulSoup(page, 'html.parser')
    # Replace each <strong> element with its own text prefixed by '#', so
    # the marker survives the get_text() call below.
    for strong in soup.select('strong'):
        strong.replace_with('#'+strong.get_text())
    contenido=(soup.div.get_text())
    fprod = 'vicentec/prod_'+(str(x))+'.txt'
    # 'with' closes the file even if the write raises.
    with open(fprod, "w") as out:
        out.write(u' '.strip(contenido).join((contenido)).encode('utf-8'))

Categories

Resources