Python: going to a new page by changing the URL number - python

Hi, I need to make this Python code loop over pages 1 to 5
('https://test.com/index/1')
if there is any way to do it with Selenium that would be cool <3
I am new to Python
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import requests
import json
import csv
import time

# Fetch the first index page and print the leading token of each .jpg file name.
html = urlopen('https://test.com/index/1')
bs = BeautifulSoup(html, 'html.parser')
# BUG FIX: the original pattern '.jpg' left the dot unescaped, so it also
# matched names like "photo_xjpg" — an unescaped '.' matches any character.
images = bs.find_all('img', {'src': re.compile(r'\.jpg')})
for image in images:
    # e.g. ".../photos/cat_01.jpg" -> "cat"
    print(image['src'].split('/')[-1].split('_')[0])

You can loop over a range of numbers and use them as a parameter
# Visit index pages 1 through 5 and print the photo-name prefix of each .jpg.
for page in range(1, 6):
    markup = urlopen(f'https://test.com/index/{page}')
    parsed = BeautifulSoup(markup, 'html.parser')
    for image in parsed.find_all('img', {'src': re.compile('.jpg')}):
        # ".../photos/cat_01.jpg" -> "cat"
        print(image['src'].split('/')[-1].split('_')[0])

It is mostly the same as @Guy's answer, but you can do it this way too.
# Same idea as the answer above, but using %-style string formatting.
for num in range(1, 6):
    # BUG FIX: the original wrote f'https://test.com/index/%s' % num — an
    # f-string prefix with no placeholder combined with %-formatting. It only
    # worked by accident (no braces in the URL); plain %-formatting is what
    # was actually intended.
    html = urlopen('https://test.com/index/%s' % num)
    bs = BeautifulSoup(html, 'html.parser')
    images = bs.find_all('img', {'src': re.compile('.jpg')})
    for image in images:
        print(image['src'].split('/')[-1].split('_')[0])

Related

Extracting integer from the paragraph

I am trying to extract only the fee amounts from the paragraph, but I am facing a problem: there are two fee amounts and I want both of them. This is my code for: http://www.reading.ac.uk/ready-to-study/study/subject-area/modern-languages-and-european-studies-ug/ba-spanish-and-history.aspx
import re

fees_div = soup.find('div', class_='Fees hiddenContent pad-around-large tabcontent')
if fees_div:
    # BUG FIX: find_all('\d+', 'p') treats '\d+' as a *tag name* (and 'p' as
    # the attrs argument), so it always returned []. Pull the £-amounts out of
    # the div's text with a regex instead.
    fees_list = re.findall(r'£[\d,]+', fees_div.get_text())
    course_data['Fees'] = fees_list
    print('fees : ', fees_list)
Try this please:
In [10]: import requests
In [11]: from bs4 import *
In [12]: page = requests.get('http://www.reading.ac.uk/ready-to-study/study/subject-area/modern-languages-and-european-studies-ug/ba-spanish-and-history.aspx')
In [13]: soup = BeautifulSoup(page.content, 'html.parser')
In [14]: [x for x in soup.find('div', class_='Fees hiddenContent pad-around-large tabcontent').text.split() if u"\xA3" in x]
Out[14]: ['£9,250*', '£17,320']
Give this a shot:
import re
import requests
from bs4 import BeautifulSoup

# Grab the fees panel's text and pull out both fee amounts with one regex:
# each amount follows the word "students:" and ends at a '*' or whitespace.
response = requests.get('http://www.reading.ac.uk/ready-to-study/study/subject-area/modern-languages-and-european-studies-ug/ba-spanish-and-history.aspx')
soup = BeautifulSoup(response.text, 'html.parser')
panel_text = soup.find(id='Panel5').text
fees = re.findall(r"students:[^£]+(.*?)[*\s]", panel_text)
print(fees)
Output:
['£9,250', '£17,320']
import requests
from bs4 import BeautifulSoup
import re

r = requests.get('http://www.reading.ac.uk/ready-to-study/study/subject-area/modern-languages-and-european-studies-ug/ba-spanish-and-history.aspx')
soup = BeautifulSoup(r.text, 'html.parser')
fees_div = soup.find('div', class_='Fees hiddenContent pad-around-large tabcontent')


def _fee_from(selector):
    """Return the first £-amount in the paragraph matched by *selector*."""
    paragraph = fees_div.select(selector)[0]
    return re.search(r'£[\d,]+', paragraph.get_text())[0]


# The home and international fees sit in the 2nd and 3rd <p> of the panel.
fee1 = _fee_from('p:nth-of-type(2)')
fee2 = _fee_from('p:nth-of-type(3)')
print(fee1, fee2)
Prints:
£9,250 £17,320
Update
You can also scrape the page using Selenium, although in this case it offers no advantage. For example (using Chrome):
from selenium import webdriver
from bs4 import BeautifulSoup
import re

# Configure a headless Chrome instance with driver log noise suppressed.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("headless")
chrome_options.add_experimental_option('excludeSwitches', ['enable-logging'])
driver = webdriver.Chrome(options=chrome_options)

driver.get('http://www.reading.ac.uk/ready-to-study/study/subject-area/modern-languages-and-european-studies-ug/ba-spanish-and-history.aspx')
soup = BeautifulSoup(driver.page_source, 'html.parser')
fees_div = soup.find('div', class_='Fees hiddenContent pad-around-large tabcontent')

# The home/international fees sit in the 2nd and 3rd <p> of the fees panel.
fee1 = re.search(r'£[\d,]+', fees_div.select('p:nth-of-type(2)')[0].get_text())[0]
fee2 = re.search(r'£[\d,]+', fees_div.select('p:nth-of-type(3)')[0].get_text())[0]
print(fee1, fee2)
driver.quit()
Update
Consider just scanning the entire HTML source, without using BeautifulSoup, looking for the fees with a simple regular-expression findall:
import requests
import re

# A regex over the raw HTML is enough here — no parser needed.
response = requests.get('http://www.reading.ac.uk/ready-to-study/study/subject-area/modern-languages-and-european-studies-ug/ba-spanish-and-history.aspx')
print(re.findall(r'£[\d,]+', response.text))
Prints:
['£9,250', '£17,320']

How to open all href within a div class?

I'm new to python and everything, and I'm looking to parse all the href within a div class. My goal is to create a program to open up all the links within the div class to be able to save the photos associated with the href.
The link: https://www.opi.com/shop-products/nail-polish-powders/nail-lacquer
The section I want to parse is "div-id: all_nail_lacquer"
So far I'm able to get all the href and this is what I have so far:
import urllib
import urllib.request
from bs4 import BeautifulSoup

# Print the page title, then every href inside the all_nail_lacquer section.
theurl = "https://www.opi.com/shop-products/nail-polish-powders/nail-lacquer"
thepage = urllib.request.urlopen(theurl)
soup = BeautifulSoup(thepage, "html.parser")
print(soup.title.text)

# The id is unique, so reuse the single <div> rather than re-scanning the page
# (the original re-ran soup.findAll and left this variable unused; a dead
# commented-out draft loop was removed as well).
nail_lacquer = soup.find('div', {"id": "all_nail_lacquer"})
if nail_lacquer:
    for anchor in nail_lacquer.findAll('a'):
        print(anchor.get('href'))
To print image links (even hi-res image) and titles, you can use this script:
import urllib
import urllib.request
from bs4 import BeautifulSoup

theurl = "https://www.opi.com/shop-products/nail-polish-powders/nail-lacquer"
soup = BeautifulSoup(urllib.request.urlopen(theurl), "html.parser")

# Every product thumbnail is a foaf:Image element carrying a data-src URL.
for img in soup.select('#all_nail_lacquer [typeof="foaf:Image"][data-src]'):
    shelf_url = img['data-src']
    print(shelf_url)
    # Swapping the style folder yields the hi-res photo URL.
    print(shelf_url.replace('shelf_image', 'photos'))
    print(img['title'])
    print('-' * 80)
Prints:
https://www.opi.com/sites/default/files/styles/product_shelf_image/public/baby-take-a-vow-nlsh1-nail-lacquer-22850011001_0_0.jpg?itok=3b2ftHzc
https://www.opi.com/sites/default/files/styles/product_photos/public/baby-take-a-vow-nlsh1-nail-lacquer-22850011001_0_0.jpg?itok=3b2ftHzc
Baby, Take a Vow
--------------------------------------------------------------------------------
https://www.opi.com/sites/default/files/styles/product_shelf_image/public/suzi-without-a-paddle-nlf88-nail-lacquer-22006698188_21_0.jpg?itok=mgi1-rz3
https://www.opi.com/sites/default/files/styles/product_photos/public/suzi-without-a-paddle-nlf88-nail-lacquer-22006698188_21_0.jpg?itok=mgi1-rz3
Suzi Without a Paddle
--------------------------------------------------------------------------------
https://www.opi.com/sites/default/files/styles/product_shelf_image/public/coconuts-over-opi-nlf89-nail-lacquer-22006698189_24_1_0.jpg?itok=yasOZA4l
https://www.opi.com/sites/default/files/styles/product_photos/public/coconuts-over-opi-nlf89-nail-lacquer-22006698189_24_1_0.jpg?itok=yasOZA4l
Coconuts Over OPI
--------------------------------------------------------------------------------
https://www.opi.com/sites/default/files/styles/product_shelf_image/public/no-tan-lines-nlf90-nail-lacquer-22006698190_20_1_0.jpg?itok=ot_cu8c5
https://www.opi.com/sites/default/files/styles/product_photos/public/no-tan-lines-nlf90-nail-lacquer-22006698190_20_1_0.jpg?itok=ot_cu8c5
No Tan Lines
--------------------------------------------------------------------------------
...and so on.
EDIT: To save images to disk, you can use this script:
import requests
from bs4 import BeautifulSoup

# Download every hi-res product photo to img_0001.jpg, img_0002.jpg, ...
theurl = "https://www.opi.com/shop-products/nail-polish-powders/nail-lacquer"
thepage = requests.get(theurl)
soup = BeautifulSoup(thepage.content, "html.parser")

# enumerate() replaces the original hand-maintained counter (i = 1 ... i += 1).
for i, img in enumerate(soup.select('#all_nail_lacquer [typeof="foaf:Image"][data-src]'), start=1):
    # 'shelf_image' is the thumbnail style folder; 'photos' is the hi-res one.
    u = img['data-src'].replace('shelf_image', 'photos')
    with open('img_{:04d}.jpg'.format(i), 'wb') as f_out:
        print('Saving {}'.format(u))
        f_out.write(requests.get(u).content)

How to get HTML of the whole page without scrolling?

import requests
import urllib.request
from bs4 import BeautifulSoup
def get_photos(nick, how_many, dest_dir="/Users/user/PycharmProjects/untitled1/Instagram_images"):
    """Download up to *how_many* post images from a picuki.com profile.

    The destination directory was previously hard-coded; it is now a keyword
    parameter whose default preserves the original behaviour. Only the first
    page of results (up to 18 images) is fetched — see the question text.
    """
    url = f"https://www.picuki.com/profile/{nick}"
    content = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).content
    soup = BeautifulSoup(content, "html.parser")
    images = [f["src"] for f in soup.findAll('img', class_="post-image")]
    for index, image in enumerate(images, start=1):
        urllib.request.urlretrieve(image, f"{dest_dir}/image{index}.png")
        if index == how_many:
            break


if __name__ == "__main__":
    get_photos("Username", 20)
So I have this code which downloads images in png format from Instagram. But the problem is that this page only loads 18 images without scrolling. So if I want images 18-36 I need to scroll down the page one more time; if 36-54, I need to scroll down 2 times and get its HTML. How do I do that with requests, and is it even possible with this module?
The images are loaded with Ajax, but you can emulate the Ajax with requests module.
This script will print all image URLs found on user profile:
import requests
from bs4 import BeautifulSoup
username = 'itsdougthepug'
base_url = 'https://www.picuki.com/profile/{username}'
def get_image_urls(username):
    """Yield every post-image URL on a picuki profile, following pagination.

    Emulates the page's "load more" Ajax: each page exposes the next page's
    path either as the data-next attribute of .load-more-wrapper or as the
    value of a hidden .pagination-next-page-input element.
    """
    url = base_url.format(username=username)
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    while True:
        for f in soup.findAll('img', class_="post-image"):
            yield f['src']
        # BUG FIX: the original (as rendered) read ['value'] from the element
        # that carries data-next, and its else-branch subscripted None with
        # ['data-next'] (a TypeError). Read the attribute that matches the
        # element actually found.
        next_el = soup.select_one('.load-more-wrapper[data-next]')
        if next_el is not None:
            next_path = next_el['data-next']
        else:
            next_el = soup.select_one('.pagination-next-page-input[value]')
            next_path = next_el['value'] if next_el is not None else None
        if not next_path:
            break
        soup = BeautifulSoup(requests.get('https://www.picuki.com' + next_path).content, 'html.parser')
# Drive the generator and print each discovered image URL, one per line.
for image_url in get_image_urls(username):
    print(image_url)
Prints:
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103328423_965950027183296_957866876806120724_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=100&_nc_ohc=sW8Ic2lI-4UAX_b7bkB&oh=dc42f3f625065b6fba524bd39fc29cb0&oe=5EE7819B
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103183716_3364797436946158_1962633742202963007_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=1&_nc_ohc=OjegUcacb2kAX_BGNBA&oh=92a8035ffed07e724a77617c6ff73b73&oe=5F0F1F22
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/s640x640/102951446_2650089068539996_1395066409287738000_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=1&_nc_ohc=zXDXxxtqYUkAX9_1jE3&oh=06e83257c7a2b1cfea593719a3af60d2&oe=5F0D3F32
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103290695_2721943028038123_664290938707092396_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=107&_nc_ohc=cZKGnM3wjBwAX9wsGvR&oh=132218410341a0ffc2d7d78f38904a01&oe=5F104353
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103207650_283928112789317_1081832932435688252_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=105&_nc_ohc=3XfsL50CwCoAX9k2_dN&oh=969bdf74e73466a39952957bfd8ec528&oe=5F0E2A91
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/s640x640/102546510_111827600395599_8198630171951588410_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=103&_nc_ohc=cVJqLrxo-fUAX9fBZtG&oh=8edcc8a5bf56519d0155e6d23ac514b3&oe=5F0EA104
... and so on.

Newbie, scraping table from url and not able to get output in python command prompt

This is a homework assignment. I have to sum up all the data in the span tags and print it. I took out all the info in the span tags and appended it to a list; I don't know how to go beyond that, as any function I type outside the for loop does not run. Also, I have to hit enter twice after I run this in the Python command prompt to get an output.
I am new here — please forgive the format of the question. Thanks for the help.
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

lst = list()
url = 'http://py4e-data.dr-chuck.net/comments_42.html'
soup = BeautifulSoup(urllib.request.urlopen(url).read(), 'html.parser')

# Each comment count lives in a <span class="comments"> element.
for span in soup.findAll('span', attrs={'class': 'comments'}):
    value = int(span.text)
    lst.append(value)
    print(value)
No need to collect to list or anything if you just adding them together. You can do it like this:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

# Sum the comment counts found in <span class="comments"> elements.
url = 'http://py4e-data.dr-chuck.net/comments_42.html'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
spans = soup.findAll('span', attrs={'class': 'comments'})

# 'total' rather than 'sum': the original shadowed the built-in sum().
# The unused 'lst' variable was removed as well.
total = 0
for span in spans:
    total += int(span.text)
print("Total Sum = " + str(total))
As you have already prepared the list of numbers from the spans, you can get their total using the built-in sum() function in Python. Pass your list as the argument of sum().
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup

url = 'http://py4e-data.dr-chuck.net/comments_42.html'
html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

# Collect every comment count, echoing each one as the original did.
nums = [int(span.text) for span in soup.findAll('span', attrs={'class': 'comments'})]
for n in nums:
    print(n)

# sum() adds up everything collected above.
print(sum(nums))

Image src is not completely scraped.(../../../images/stat/popolazione.jpg)

I am using the following code and I am unable to get the complete src of the image.
from requests import get
from bs4 import BeautifulSoup
import re
def statistics(url):
    """Print the src attribute of every <img> in the page's 'tabwrap' statistics table."""
    response = get(url)
    soup = BeautifulSoup(response.text, "lxml")
    stats_table = soup.find('table', class_='tabwrap')
    for image in stats_table.findAll('img'):
        print(image['src'])


statistics('http://www.comuni-italiani.it/084/001/statistiche/')
output:
This is image src i want to scrape:
Link to website: http://www.comuni-italiani.it/084/001/statistiche/
I want to get this Popolazione number out of the image src — that's why I need the src, so I can split it and get the number 59.605.
Try the following to get the numbers available in those image links:
import requests
from bs4 import BeautifulSoup
def get_image_links(url):
    """Print the number embedded in each footnote image URL inside the stats table."""
    soup = BeautifulSoup(requests.get(url).text, "lxml")
    # Only images whose src contains "d_fnote_title" carry the encoded value.
    for img in soup.select('table.tabwrap img[src*="d_fnote_title"]'):
        # The value sits between "h|" and the following "|" in the src string.
        print(img['src'].split("h|")[1].split("|")[0])


if __name__ == '__main__':
    link = 'http://www.comuni-italiani.it/084/001/statistiche/'
    get_image_links(link)

Categories

Resources