How to get HTML of the whole page without scrolling? - python

import requests
import urllib.request
from bs4 import BeautifulSoup

def get_photos(nick, how_many):
    url = f"https://www.picuki.com/profile/{nick}"
    content = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).content
    soup = BeautifulSoup(content, "html.parser")
    images = [f["src"] for f in soup.findAll('img', class_="post-image")]
    for index, image in enumerate(images, start=1):
        urllib.request.urlretrieve(image, f"/Users/user/PycharmProjects/untitled1/Instagram_images/image{index}.png")
        if index == how_many: break

if __name__ == "__main__":
    get_photos("Username", 20)
So I have this code which downloads images from Instagram in PNG format. The problem is that the page only loads 18 images without scrolling. So if I want images 18-36 I need to scroll the page down once more, and if 36-54 I need to scroll down twice and get its HTML. How can I do that with requests, and is it even possible with this module?

The images are loaded with Ajax, but you can emulate the Ajax calls with the requests module.
This script will print all image URLs found on the user's profile:
import requests
from bs4 import BeautifulSoup

username = 'itsdougthepug'
base_url = 'https://www.picuki.com/profile/{username}'

def get_image_urls(username):
    url = base_url.format(username=username)
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')

    while True:
        for f in soup.findAll('img', class_="post-image"):
            yield f['src']

        load_more_url = soup.select_one('.load-more-wrapper[data-next]')
        if not load_more_url:
            load_more_url = soup.select_one('.pagination-next-page-input[value]')
            if load_more_url:
                load_more_url = load_more_url['value']
        else:
            load_more_url = load_more_url['data-next']

        if not load_more_url:
            break

        soup = BeautifulSoup(requests.get('https://www.picuki.com' + load_more_url).content, 'html.parser')

for img in get_image_urls(username):
    print(img)
Prints:
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103328423_965950027183296_957866876806120724_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=100&_nc_ohc=sW8Ic2lI-4UAX_b7bkB&oh=dc42f3f625065b6fba524bd39fc29cb0&oe=5EE7819B
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103183716_3364797436946158_1962633742202963007_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=1&_nc_ohc=OjegUcacb2kAX_BGNBA&oh=92a8035ffed07e724a77617c6ff73b73&oe=5F0F1F22
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/s640x640/102951446_2650089068539996_1395066409287738000_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=1&_nc_ohc=zXDXxxtqYUkAX9_1jE3&oh=06e83257c7a2b1cfea593719a3af60d2&oe=5F0D3F32
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103290695_2721943028038123_664290938707092396_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=107&_nc_ohc=cZKGnM3wjBwAX9wsGvR&oh=132218410341a0ffc2d7d78f38904a01&oe=5F104353
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/p640x640/103207650_283928112789317_1081832932435688252_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=105&_nc_ohc=3XfsL50CwCoAX9k2_dN&oh=969bdf74e73466a39952957bfd8ec528&oe=5F0E2A91
https://scontent-sin6-2.cdninstagram.com/v/t51.2885-15/sh0.08/e35/s640x640/102546510_111827600395599_8198630171951588410_n.jpg?_nc_ht=scontent-sin6-2.cdninstagram.com&_nc_cat=103&_nc_ohc=cVJqLrxo-fUAX9fBZtG&oh=8edcc8a5bf56519d0155e6d23ac514b3&oe=5F0EA104
... and so on.
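To tie this back to the original get_photos function, here is a minimal sketch that reuses get_image_urls from above and limits the generator to the first how_many images before downloading them. The output folder name is just a placeholder, not something from the question:

import itertools
import os
import urllib.request

def get_photos(nick, how_many, out_dir="Instagram_images"):
    # out_dir is a placeholder; point it at your own folder
    os.makedirs(out_dir, exist_ok=True)
    for index, image in enumerate(itertools.islice(get_image_urls(nick), how_many), start=1):
        urllib.request.urlretrieve(image, f"{out_dir}/image{index}.png")

get_photos("Username", 20)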

Related

Image src is not completely scraped (../../../images/stat/popolazione.jpg)

I am using the following code and I am unable to get the complete src of the image.
from requests import get
from bs4 import BeautifulSoup
import re

def statistics(url):
    resp = get(url)
    soup = BeautifulSoup(resp.text, "lxml")
    reqTable = soup.find('table', class_='tabwrap')
    images = reqTable.findAll('img')
    for img in images:
        print(img['src'])

statistics('http://www.comuni-italiani.it/084/001/statistiche/')
output:
This is the image src I want to scrape:
Link to website: http://www.comuni-italiani.it/084/001/statistiche/
I want to get the Popolazione number out of the image src, which is why I need that src, so I can split it and get the number 59.605.
Try the following to get the numbers available in those image links:
import requests
from bs4 import BeautifulSoup

def get_image_links(url):
    resp = requests.get(url)
    soup = BeautifulSoup(resp.text, "lxml")
    for img in soup.select('table.tabwrap img[src*="d_fnote_title"]'):
        item_num = img['src'].split("h|")[1].split("|")[0]
        print(item_num)

if __name__ == '__main__':
    link = 'http://www.comuni-italiani.it/084/001/statistiche/'
    get_image_links(link)
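If you then need the value as a number, note that 59.605 appears to use a period as a thousands separator (Italian formatting); that is an assumption about the data, not something stated in the question. A small helper could convert it:

def to_number(text):
    # Assumes values like "59.605" use "." as a thousands separator
    # (Italian number formatting); adjust if decimal values can occur.
    return int(text.replace(".", ""))

print(to_number("59.605"))  # -> 59605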

Scrape all images off of a multi-page site?

I need to scrape all the images from the pages of the URL given in the code, but I could only do it manually, page by page, up to the last page (the 100th).
This is the code for scraping each page; I replace the page number each time and rerun it. It is shown below.
Is there any way to make the page number a variable and run a loop until it hits an error, in this case a 404 page (since no more pages would be left)?
from bs4 import *
import requests as rq

r2 = rq.get("https://www.gettyimages.in/photos/aishwarya-rai?family=editorial&page=1&phrase=aishwarya%20rai&sort=mostpopular")
soup2 = BeautifulSoup(r2.text, "html.parser")

links = []
x = soup2.select('img[src^="https://media.gettyimages.com/photos/"]')  # the frame where it shows the images
for img in x:
    links.append(img['src'])

for index, img_link in enumerate(links):
    img_data = rq.get(img_link).content
    with open("aishwarya_rai/" + str(index + 2) + '.jpg', 'wb+') as f:
        f.write(img_data)
else:
    f.close()
The page ranges from 1 to 100.
I need some additional code that makes the page value a variable and loops until 100.
Use the format() function and pass in the page variable:
from bs4 import *
import requests as rq

url = "https://www.gettyimages.in/photos/aishwarya-rai?family=editorial&page={}&phrase=aishwarya%20rai&sort=mostpopular"
links = []

for page in range(1, 101):
    print(url.format(page))
    r2 = rq.get(url.format(page))
    soup2 = BeautifulSoup(r2.text, "html.parser")
    x = soup2.select('img[src^="https://media.gettyimages.com/photos/"]')
    for img in x:
        links.append(img['src'])

print(links)
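If you would rather not hard-code the upper bound of 100, a minimal sketch along these lines could stop on either a 404 or an empty page. That the site actually answers with a 404 or an empty result set past the last page is an assumption, not something confirmed in the question:

from bs4 import BeautifulSoup
import requests as rq

url = "https://www.gettyimages.in/photos/aishwarya-rai?family=editorial&page={}&phrase=aishwarya%20rai&sort=mostpopular"
links = []
page = 1
while True:
    r2 = rq.get(url.format(page))
    if r2.status_code == 404:  # assumed: no more pages left
        break
    soup2 = BeautifulSoup(r2.text, "html.parser")
    x = soup2.select('img[src^="https://media.gettyimages.com/photos/"]')
    if not x:  # page loaded but contains no photo thumbnails
        break
    links.extend(img['src'] for img in x)
    page += 1

print(len(links))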

I am using BeautifulSoup, how can I get the link after the redirect?

I want to get the link that the download link on the article page redirects to.
For example:
https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/
The above article page contains the following download link:
https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/yz5cw79mbn3a/ECNHOgoNYk0MIkEoFlUkFlY5Vj5WVSRQACVKfx8EOw8ReVs+FFs=
If you open this link directly, it will not redirect to the real download link; you need to open it from the article page.
# coding=utf-8
import lxml
import re
import requests
import sys
from bs4 import BeautifulSoup
from urllib.request import urlopen

def urlopen(url):
    '''
    using requests to replace urllib.request.urlopen
    return an html
    '''
    headers = {"User-Agent": "Mozilla/5.0"}
    r = requests.get(url, headers=headers)
    return r.text

def generate_pages(subTitle, fromPage, toPage):
    '''
    return page sites' url list
    '''
    pages = []
    if(fromPage > 0 and fromPage < toPage):
        for i in range(fromPage, toPage + 1):
            pages.append('https://scanlibs.com/category/books' + subTitle + '/page/' + str(i))
    return pages

def get_book_sites_of_one_page(page):
    '''
    get book site's url in one page
    input: page site url
    output: book site urls list
    return book sites in one page
    '''
    html = urlopen(page)
    soup = BeautifulSoup(html, 'html.parser')
    linkList = soup.find('main').findAll('a', {'rel': 'bookmark'})
    bookSites = []
    for link in linkList[::2]:
        if 'href' in link.attrs:
            #print(link)
            bookSites.append(link.attrs['href'])
    return bookSites

def get_book_urls(bookSite):
    '''
    input a book site
    find book downloading urls in this book site
    then
    return them as a list
    '''
    bookURLs = []
    html = urlopen(bookSite)
    soup = BeautifulSoup(html, 'lxml')
    linkList = soup.findAll("a", {"target": "_blank"})
    for link in linkList[::2]:
        # print(link)
        if 'href' in link.attrs:
            bookURLs.append(link.attrs['href'])
    return bookURLs

def get_all_book_urls(fromPage=1, toPage=1, subTitle=''):
    bookSites = []
    bookURLs = []
    pages = generate_pages(subTitle, fromPage, toPage)
    for page in pages:
        bookSiteOfOnePage = get_book_sites_of_one_page(page)
        bookSites.extend(bookSiteOfOnePage)
    for bookSite in bookSites:
        book_urls = get_book_urls(bookSite)
        bookURLs += book_urls
    for bookURL in bookURLs:
        print(bookURL)
    #with open(filename, 'w') as f:
    #    f.write(bookURLs)

def main():
    if(len(sys.argv) == 4):
        '''
        python getUrl.py 1, 100, programming
        from page 1 to page in subject programming
        '''
        subTitle = str(sys.argv[3])
        fromPage = int(sys.argv[1])
        toPage = int(sys.argv[2])
        get_all_book_urls(fromPage, toPage, subTitle)
    if(len(sys.argv) == 3):
        '''
        python getUrl.py 1 100
        from page 1 to page 100
        '''
        subTitle = ''
        fromPage = int(sys.argv[1])
        toPage = int(sys.argv[2])
        #filename = subTitle="-"+str(pageNum)+".txt"
        get_all_book_urls(fromPage, toPage, subTitle)
    elif(len(sys.argv) == 2):
        '''
        python getUrl.py 10
        from page 10 to page 10
        only download books on page 10
        '''
        fromPage = int(sys.argv[1])
        toPage = fromPage + 1
        subTitle = ''
        #filename = "All-"+str(pageNum)+".txt"
        get_all_book_urls(fromPage, toPage, subTitle)
    elif(len(sys.argv) == 1):
        fromPage = 1
        # custom page range
        toPage = 2
        subTitle = ''
        #filename = "All-"+"1"+"-"+time.strftime('%Y-%m-%d', time.localtime())+".txt"
        get_all_book_urls(fromPage, toPage, subTitle)
    else:
        print("Error, too many arguments")

if __name__ == '__main__':
    #filename = ''
    main()
Thank you for your help!
This website checks whether the referer is set when redirecting. You can simply pass the original URL as the referer in the headers to bypass this. You can also see that the referer is used as a URL parameter in the final download link.
import requests
from bs4 import BeautifulSoup

s = requests.Session()
url = 'https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/'
r = html = s.get(url).text
soup = BeautifulSoup(html, 'html.parser')
relative_link = soup.find('a', {'id': 'download'})['href']  # get the relative link
download_redirect_link = url + relative_link
headers = {
    "referer": url
}
r2 = requests.get(download_redirect_link, headers=headers)
print(r2.url)
Output
https://rapidgator.net/file/80e881f7631eddb49de31e5718eb96ba?referer=https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/
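Since requests follows redirects by default, r2.url above is already the final address. If you prefer to read the redirect target without fetching the final page at all, a small variation continuing from the snippet above (assuming the site answers with a standard 3xx redirect) would be:

r3 = requests.get(download_redirect_link, headers=headers, allow_redirects=False)
print(r3.status_code)               # e.g. 301/302 when the redirect fires
print(r3.headers.get("Location"))   # the real download link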

Python expanding YouTube uploads

So, I'm working on a little program that will automatically check and download new music from a bunch of given YouTube channels. I'm currently working on a way to obtain the links of all the videos each channel has uploaded, which I'm doing with a scraper. (Yes, the YouTube API would probably be the proper way to go, but I don't know how to use it properly yet.)
from __future__ import unicode_literals
from bs4 import BeautifulSoup
import urllib.request

ytlink = 'https://www.youtube.com/channel/UCUvoulvwzCnUVk7yoduI_Gw/videos'
r = urllib.request.urlopen(ytlink).read()
soup = BeautifulSoup(r, "html.parser")
links = soup.find_all('a', {"class": "yt-uix-sessionlink yt-uix-tile-link spf-link yt-ui-ellipsis yt-ui-ellipsis-2"})
for tag in links:
    link = tag.get('href', None)
    if link is not None:
        print(link)
This is what I currently have. The problem is that it only grabs the first 30 video links, since those are the only ones on screen. I've already seen that when the "Load More" button is pressed it executes some Ajax initiated by JavaScript. My question is: how can I get Python to keep triggering the "Load More" button until all uploads are visible?
You can easily mimic the Ajax calls and parse the JSON output returned; we just need to pull the /browse_ajax?action_continuation=... URL and keep requesting until it no longer appears in the HTML returned inside the JSON:
from bs4 import BeautifulSoup
import requests
from urlparse import urljoin  # python 3 -> from urllib.parse import urljoin

def get_links():
    # create all css selectors
    ytlink = 'https://www.youtube.com/channel/UCUvoulvwzCnUVk7yoduI_Gw/videos'
    ajax_css = "button[data-uix-load-more-href]"
    link_css = "a.yt-uix-sessionlink.yt-uix-tile-link.spf-link.yt-ui-ellipsis.yt-ui-ellipsis-2"
    base = "https://www.youtube.com/"
    r = requests.get(ytlink).content
    soup = BeautifulSoup(r, "lxml")

    # yield the first visible links
    for link in soup.select(link_css):
        yield urljoin(base, link["href"])

    # "Load more" button
    ajax = soup.select(ajax_css)[0]["data-uix-load-more-href"]
    while True:
        print(ajax)
        r = requests.get(urljoin('https://www.youtube.com/', ajax))
        # the next chunk of html is stored in the json values()
        soup = BeautifulSoup("".join(r.json().values()), "lxml")
        for link in soup.select(link_css):
            yield urljoin(base, link["href"])
        ajax = soup.select(ajax_css)
        # if empty, the "Load more" button is gone
        if not ajax:
            break
        ajax = ajax[0]["data-uix-load-more-href"]
That would give you all 87 links.
In [26]: links = list(get_links())
/browse_ajax?action_continuation=1&continuation=4qmFsgJAEhhVQ1V2b3Vsdnd6Q25VVms3eW9kdUlfR3caJEVnWjJhV1JsYjNNZ0FEZ0JZQUZxQUhvQk1yZ0JBQSUzRCUzRA%253D%253D
/browse_ajax?action_continuation=1&continuation=4qmFsgJAEhhVQ1V2b3Vsdnd6Q25VVms3eW9kdUlfR3caJEVnWjJhV1JsYjNNZ0FEZ0JZQUZxQUhvQk03Z0JBQSUzRCUzRA%253D%253D
In [27]: len(links)
Out[27]: 87
In [28]: print(links)
['https://www.youtube.com/watch?v=kjmzIu4VJEY', 'https://www.youtube.com/watch?v=ecRpNV8Xob8', 'https://www.youtube.com/watch?v=mdHoaoAhnMo', 'https://www.youtube.com/watch?v=3oqBKEvdrqE', 'https://www.youtube.com/watch?v=VIbvfOd34-A', 'https://www.youtube.com/watch?v=x4G8ge1VO5s', 'https://www.youtube.com/watch?v=EkW0f2iUOCc', 'https://www.youtube.com/watch?v=Ex2NIeXfYl8', 'https://www.youtube.com/watch?v=XMd4pSX-aVs', 'https://www.youtube.com/watch?v=ZS7KjUjlLWA', 'https://www.youtube.com/watch?v=ZEq9sQJLOgg', 'https://www.youtube.com/watch?v=nSgaCowC5TY', 'https://www.youtube.com/watch?v=nV5Ive_zJT4', 'https://www.youtube.com/watch?v=snThWzMroaA', 'https://www.youtube.com/watch?v=Ud6YhBCucPg', 'https://www.youtube.com/watch?v=1nSfyivyxdg', 'https://www.youtube.com/watch?v=b7hf2wqpUY4', 'https://www.youtube.com/watch?v=cVBvxkVt9wc', 'https://www.youtube.com/watch?v=pcI25yU9yso', 'https://www.youtube.com/watch?v=EMIZZS8HY8A', 'https://www.youtube.com/watch?v=xWD3Zi23rIs', 'https://www.youtube.com/watch?v=M-IbllcTi64', 'https://www.youtube.com/watch?v=U_tW_UxG8bM', 'https://www.youtube.com/watch?v=vQd0mopVnQg', 'https://www.youtube.com/watch?v=mG8NJlsg4rI', 'https://www.youtube.com/watch?v=PsaNY6xpnKY', 'https://www.youtube.com/watch?v=839h3eZMSWA', 'https://www.youtube.com/watch?v=Q_yytPtWmP0', 'https://www.youtube.com/watch?v=oGESQfB9dYM', 'https://www.youtube.com/watch?v=mO5R-1uTJhg', 'https://www.youtube.com/watch?v=wgqLck9SFOc', 'https://www.youtube.com/watch?v=GCaFEsxd-Y8', 'https://www.youtube.com/watch?v=VlpMbnOqP20', 'https://www.youtube.com/watch?v=bj1QT5bxFlA', 'https://www.youtube.com/watch?v=SMtKCu6a7gQ', 'https://www.youtube.com/watch?v=RV6x33mf4WI', 'https://www.youtube.com/watch?v=WhlXuTtmNqE', 'https://www.youtube.com/watch?v=7TWN1G5e-tg', 'https://www.youtube.com/watch?v=jgjeYTkROyk', 'https://www.youtube.com/watch?v=0hFkFoOf-aA', 'https://www.youtube.com/watch?v=yH1u_KQapfw', 'https://www.youtube.com/watch?v=5-l-FGDsbjw', 'https://www.youtube.com/watch?v=sFSgyE64Jjw', 'https://www.youtube.com/watch?v=OhDBtfvv2BM', 'https://www.youtube.com/watch?v=uFgPFi04oTo', 'https://www.youtube.com/watch?v=58a45EfYv1g', 'https://www.youtube.com/watch?v=jtYl5TbK2nc', 'https://www.youtube.com/watch?v=TI-1qxoDRnw', 'https://www.youtube.com/watch?v=Q0M90HqibHI', 'https://www.youtube.com/watch?v=Llb19v7QiXU', 'https://www.youtube.com/watch?v=sqhL_Ms6vuY', 'https://www.youtube.com/watch?v=YFFRgAjXs1Y', 'https://www.youtube.com/watch?v=8eHFG5AACHI', 'https://www.youtube.com/watch?v=_eVOx8Sw9Jg', 'https://www.youtube.com/watch?v=9s_XvG3M-UI', 'https://www.youtube.com/watch?v=lzdO01_tKFo', 'https://www.youtube.com/watch?v=uA2KkxfSW_U', 'https://www.youtube.com/watch?v=29Lt1LQtp5k', 'https://www.youtube.com/watch?v=nfJ9p5iJGz8', 'https://www.youtube.com/watch?v=cjMHd1xVlS0', 'https://www.youtube.com/watch?v=tkZ0FISTxkk', 'https://www.youtube.com/watch?v=bkhD8kYi4MI', 'https://www.youtube.com/watch?v=_bQajpTnOrY', 'https://www.youtube.com/watch?v=XglzEbcjP8c', 'https://www.youtube.com/watch?v=KBszbh6Qwag', 'https://www.youtube.com/watch?v=rVGWndVjCYg', 'https://www.youtube.com/watch?v=AgJxj2cUoyQ', 'https://www.youtube.com/watch?v=TaEVwakp_rI', 'https://www.youtube.com/watch?v=-YnpS-IaYCw', 'https://www.youtube.com/watch?v=sEFSFU2a9CY', 'https://www.youtube.com/watch?v=Jc2aVD4pwnk', 'https://www.youtube.com/watch?v=aY1dOJEv4j4', 'https://www.youtube.com/watch?v=bwjXt2pWoBE', 'https://www.youtube.com/watch?v=Dqn26tWxNsI', 'https://www.youtube.com/watch?v=wiv6JqGhcCU', 
'https://www.youtube.com/watch?v=IFi47HLPqoM', 'https://www.youtube.com/watch?v=N1zdWugNdy0', 'https://www.youtube.com/watch?v=ngOBscDs3T4', 'https://www.youtube.com/watch?v=RT5dQVZ-VQY', 'https://www.youtube.com/watch?v=bifExgZW7k0', 'https://www.youtube.com/watch?v=fBEbaEgox1Y', 'https://www.youtube.com/watch?v=wDy9aGFngkY', 'https://www.youtube.com/watch?v=i06Iv0k5fVY', 'https://www.youtube.com/watch?v=2NaRXV7uyPE', 'https://www.youtube.com/watch?v=Hl0nIoLJUU0', 'https://www.youtube.com/watch?v=iXo0T4dRdgA', 'https://www.youtube.com/watch?v=i-7H5Wq0_2Y']
I left the print(ajax) call in so you can see how it changes.
You could also use Selenium with PhantomJS, which would look something like:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException

ytlink = 'https://www.youtube.com/channel/UCUvoulvwzCnUVk7yoduI_Gw/videos'
hrefs = "a.yt-uix-sessionlink.yt-uix-tile-link.spf-link.yt-ui-ellipsis.yt-ui-ellipsis-2"
ajax = "button[data-uix-load-more-href]"

dr = webdriver.PhantomJS()
dr.get(ytlink)

while True:
    try:
        load_mode_b = dr.find_element_by_css_selector(ajax)
        load_mode_b.click()
    except StaleElementReferenceException as e:
        print(e)
    except NoSuchElementException as e:
        print(e)
        break
If we run this, we see the exact same output:
In [32]: l = [a.get_attribute("href") for a in dr.find_elements_by_css_selector(hrefs)]
In [33]: len(l)
Out[33]: 87
In [34]: print(l)
[u'https://www.youtube.com/watch?v=kjmzIu4VJEY', u'https://www.youtube.com/watch?v=ecRpNV8Xob8', u'https://www.youtube.com/watch?v=mdHoaoAhnMo', u'https://www.youtube.com/watch?v=3oqBKEvdrqE', u'https://www.youtube.com/watch?v=VIbvfOd34-A', u'https://www.youtube.com/watch?v=x4G8ge1VO5s', u'https://www.youtube.com/watch?v=EkW0f2iUOCc', u'https://www.youtube.com/watch?v=Ex2NIeXfYl8', u'https://www.youtube.com/watch?v=XMd4pSX-aVs', u'https://www.youtube.com/watch?v=ZS7KjUjlLWA', u'https://www.youtube.com/watch?v=ZEq9sQJLOgg', u'https://www.youtube.com/watch?v=nSgaCowC5TY', u'https://www.youtube.com/watch?v=nV5Ive_zJT4', u'https://www.youtube.com/watch?v=snThWzMroaA', u'https://www.youtube.com/watch?v=Ud6YhBCucPg', u'https://www.youtube.com/watch?v=1nSfyivyxdg', u'https://www.youtube.com/watch?v=b7hf2wqpUY4', u'https://www.youtube.com/watch?v=cVBvxkVt9wc', u'https://www.youtube.com/watch?v=pcI25yU9yso', u'https://www.youtube.com/watch?v=EMIZZS8HY8A', u'https://www.youtube.com/watch?v=xWD3Zi23rIs', u'https://www.youtube.com/watch?v=M-IbllcTi64', u'https://www.youtube.com/watch?v=U_tW_UxG8bM', u'https://www.youtube.com/watch?v=vQd0mopVnQg', u'https://www.youtube.com/watch?v=mG8NJlsg4rI', u'https://www.youtube.com/watch?v=PsaNY6xpnKY', u'https://www.youtube.com/watch?v=839h3eZMSWA', u'https://www.youtube.com/watch?v=Q_yytPtWmP0', u'https://www.youtube.com/watch?v=oGESQfB9dYM', u'https://www.youtube.com/watch?v=mO5R-1uTJhg', u'https://www.youtube.com/watch?v=wgqLck9SFOc', u'https://www.youtube.com/watch?v=GCaFEsxd-Y8', u'https://www.youtube.com/watch?v=VlpMbnOqP20', u'https://www.youtube.com/watch?v=bj1QT5bxFlA', u'https://www.youtube.com/watch?v=SMtKCu6a7gQ', u'https://www.youtube.com/watch?v=RV6x33mf4WI', u'https://www.youtube.com/watch?v=WhlXuTtmNqE', u'https://www.youtube.com/watch?v=7TWN1G5e-tg', u'https://www.youtube.com/watch?v=jgjeYTkROyk', u'https://www.youtube.com/watch?v=0hFkFoOf-aA', u'https://www.youtube.com/watch?v=yH1u_KQapfw', u'https://www.youtube.com/watch?v=5-l-FGDsbjw', u'https://www.youtube.com/watch?v=sFSgyE64Jjw', u'https://www.youtube.com/watch?v=OhDBtfvv2BM', u'https://www.youtube.com/watch?v=uFgPFi04oTo', u'https://www.youtube.com/watch?v=58a45EfYv1g', u'https://www.youtube.com/watch?v=jtYl5TbK2nc', u'https://www.youtube.com/watch?v=TI-1qxoDRnw', u'https://www.youtube.com/watch?v=Q0M90HqibHI', u'https://www.youtube.com/watch?v=Llb19v7QiXU', u'https://www.youtube.com/watch?v=sqhL_Ms6vuY', u'https://www.youtube.com/watch?v=YFFRgAjXs1Y', u'https://www.youtube.com/watch?v=8eHFG5AACHI', u'https://www.youtube.com/watch?v=_eVOx8Sw9Jg', u'https://www.youtube.com/watch?v=9s_XvG3M-UI', u'https://www.youtube.com/watch?v=lzdO01_tKFo', u'https://www.youtube.com/watch?v=uA2KkxfSW_U', u'https://www.youtube.com/watch?v=29Lt1LQtp5k', u'https://www.youtube.com/watch?v=nfJ9p5iJGz8', u'https://www.youtube.com/watch?v=cjMHd1xVlS0', u'https://www.youtube.com/watch?v=tkZ0FISTxkk', u'https://www.youtube.com/watch?v=bkhD8kYi4MI', u'https://www.youtube.com/watch?v=_bQajpTnOrY', u'https://www.youtube.com/watch?v=XglzEbcjP8c', u'https://www.youtube.com/watch?v=KBszbh6Qwag', u'https://www.youtube.com/watch?v=rVGWndVjCYg', u'https://www.youtube.com/watch?v=AgJxj2cUoyQ', u'https://www.youtube.com/watch?v=TaEVwakp_rI', u'https://www.youtube.com/watch?v=-YnpS-IaYCw', u'https://www.youtube.com/watch?v=sEFSFU2a9CY', u'https://www.youtube.com/watch?v=Jc2aVD4pwnk', u'https://www.youtube.com/watch?v=aY1dOJEv4j4', u'https://www.youtube.com/watch?v=bwjXt2pWoBE', u'https://www.youtube.com/watch?v=Dqn26tWxNsI', 
u'https://www.youtube.com/watch?v=wiv6JqGhcCU', u'https://www.youtube.com/watch?v=IFi47HLPqoM', u'https://www.youtube.com/watch?v=N1zdWugNdy0', u'https://www.youtube.com/watch?v=ngOBscDs3T4', u'https://www.youtube.com/watch?v=RT5dQVZ-VQY', u'https://www.youtube.com/watch?v=bifExgZW7k0', u'https://www.youtube.com/watch?v=fBEbaEgox1Y', u'https://www.youtube.com/watch?v=wDy9aGFngkY', u'https://www.youtube.com/watch?v=i06Iv0k5fVY', u'https://www.youtube.com/watch?v=2NaRXV7uyPE', u'https://www.youtube.com/watch?v=Hl0nIoLJUU0', u'https://www.youtube.com/watch?v=iXo0T4dRdgA', u'https://www.youtube.com/watch?v=i-7H5Wq0_2Y']
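One caveat with the Selenium loop above is that it clicks as fast as it can and relies on exceptions to stop. If that turns out to be flaky, you could wait for the button to become clickable again between clicks; a rough sketch continuing from the snippet above (the 10-second timeout is an arbitrary choice):

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException

wait = WebDriverWait(dr, 10)  # dr and ajax come from the snippet above
while True:
    try:
        button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ajax)))
        button.click()
    except TimeoutException:
        break  # button never became clickable again -> assume all videos loaded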

Scrape page with generator

I am scraping a site with Beautiful Soup. The problem I have is that certain parts of the site are paginated with JS, with an unknown (varying) number of pages to scrape.
I'm trying to get around this with a generator, but it's my first time writing one and I'm having a hard time wrapping my head around it and figuring out whether what I'm doing makes sense.
Code:
from bs4 import BeautifulSoup
import urllib
import urllib2
import jabba_webkit as jw
import csv
import string
import re
import time

tlds = csv.reader(open("top_level_domains.csv", 'r'), delimiter=';')
sites = csv.writer(open("websites_to_scrape.csv", "w"), delimiter=',')

tld = "uz"
has_next = True
page = 0

def create_link(tld, page):
    if page == 0:
        link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain"
    else:
        link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain/page/" + repr(page)
    return link

def check_for_next(soup):
    disabled_nav = soup.find(class_="pagingDivDisabled")
    if disabled_nav:
        if "Next" in disabled_nav:
            return False
        else:
            return True
    else:
        return True

def make_soup(link):
    html = jw.get_page(link)
    soup = BeautifulSoup(html, "lxml")
    return soup

def all_the_pages(counter):
    while True:
        link = create_link(tld, counter)
        soup = make_soup(link)
        if check_for_next(soup) == True:
            yield counter
        else:
            break
        counter += 1

def scrape_page(soup):
    table = soup.find('table', {'class': 'rankTable'})
    th = table.find('tbody')
    test = th.find_all("td")
    correct_cells = range(1, len(test), 3)
    for cell in correct_cells:
        #print test[cell]
        url = repr(test[cell])
        content = re.sub("<[^>]*>", "", url)
        sites.writerow([tld] + [content])

def main():
    for page in all_the_pages(0):
        print page
        link = create_link(tld, page)
        print link
        soup = make_soup(link)
        scrape_page(soup)

main()
My thinking behind the code:
The scraper should get the page, determine whether another page follows, scrape the current page, and move on to the next one, repeating the process. If there is no next page, it should stop. Does the way I'm going about it here make sense?
As I told you, you could use Selenium to programmatically click the Next button, but since that is not an option for you, I can think of the following method to get the number of pages using pure BS4:
import requests
from bs4 import BeautifulSoup

def page_count():
    pages = 1
    url = "https://domaintyper.com/top-websites/most-popular-websites-with-uz-domain/page/{}"
    while True:
        html = requests.get(url.format(pages)).content
        soup = BeautifulSoup(html)
        table = soup.find('table', {'class': 'rankTable'})
        if len(table.find_all('tr')) <= 1:
            return pages
        pages += 1
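With the page count known, you could drive the helpers from your question with a plain loop instead of the generator. A rough sketch reusing tld, create_link, make_soup and scrape_page from the question:

# Sketch only: reuses tld, create_link, make_soup and scrape_page from the question.
# Adjust the range if /page/1 turns out to duplicate the unsuffixed first page.
total = page_count()
for page in range(total):
    soup = make_soup(create_link(tld, page))
    scrape_page(soup)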
