I want to get the final URL that the download link on an article page redirects to.
For example:
https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/
In the above article page, there are the following download links:
https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/yz5cw79mbn3a/ECNHOgoNYk0MIkEoFlUkFlY5Vj5WVSRQACVKfx8EOw8ReVs+FFs=
If you open this link directly, it will not redirect to the real download link; it has to be opened from the article page.
# coding=utf-8
import lxml
import re
import requests
import sys
from bs4 import BeautifulSoup
from urllib.request import urlopen
def urlopen(url):
    """Fetch *url* with ``requests`` and return the response body as text.

    Stands in for ``urllib.request.urlopen``; a ``User-Agent`` header is
    sent with the request.
    """
    response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    return response.text
def generate_pages(subTitle, fromPage, toPage):
    """Build the list of category listing-page URLs for an inclusive page range.

    Parameters:
        subTitle: text inserted verbatim between ``/category/books`` and
            ``/page/``; pass ``''`` for the top-level listing.
            NOTE(review): no separator is added here, so a sub-category
            presumably has to carry its own leading slash -- confirm.
        fromPage: first page number; must be greater than 0.
        toPage: last page number, inclusive.

    Returns:
        A list of URL strings, one per page. The list is empty when
        ``fromPage`` is not positive or is greater than ``toPage``.
    """
    # A single-page range (fromPage == toPage) is valid and yields one URL.
    if fromPage <= 0 or fromPage > toPage:
        return []
    return [
        'https://scanlibs.com/category/books' + subTitle + '/page/' + str(i)
        for i in range(fromPage, toPage + 1)
    ]
def get_book_sites_of_one_page(page):
    """Collect the book-page URLs linked from one listing page.

    Parameters:
        page: URL of a listing page.

    Returns:
        A list with the ``href`` of every second ``rel="bookmark"`` anchor
        found inside the page's ``<main>`` element.
    """
    markup = urlopen(page)
    main_section = BeautifulSoup(markup, 'html.parser').find('main')
    anchors = main_section.findAll('a', {'rel': 'bookmark'})
    # Only every other anchor is kept -- presumably each book is linked
    # twice on the listing page; verify against the live markup.
    return [a.attrs['href'] for a in anchors[::2] if 'href' in a.attrs]
def get_book_urls(bookSite):
    """Collect the download URLs linked from one book page.

    Parameters:
        bookSite: URL of a book page.

    Returns:
        A list with the ``href`` of every second ``target="_blank"`` anchor
        on the page.
    """
    document = BeautifulSoup(urlopen(bookSite), 'lxml')
    anchors = document.findAll("a", {"target": "_blank"})
    # Step of two skips every other matching anchor, as the original did.
    return [a.attrs['href'] for a in anchors[::2] if 'href' in a.attrs]
def get_all_book_urls(fromPage=1, toPage=1, subTitle=''):
    """Print every download URL found for the given listing-page range.

    All listing pages are visited first to gather book-page URLs, then each
    book page is visited for its download URLs, and finally the URLs are
    printed one per line. Nothing is returned.
    """
    book_pages = []
    for listing in generate_pages(subTitle, fromPage, toPage):
        book_pages += get_book_sites_of_one_page(listing)

    download_links = []
    for book_page in book_pages:
        download_links.extend(get_book_urls(book_page))

    for download_link in download_links:
        print(download_link)
def main():
    """Command-line entry point: pick a page range from ``sys.argv`` and run.

    Usage:
        python getUrl.py                     -> fromPage=1,  toPage=2
        python getUrl.py 10                  -> fromPage=10, toPage=11
        python getUrl.py 1 100               -> fromPage=1,  toPage=100
        python getUrl.py 1 100 programming   -> same range, subTitle given

    With more than three arguments an error message is printed and nothing
    is scraped. Non-numeric page arguments raise ``ValueError`` from ``int``.
    """
    args = sys.argv[1:]

    # Reject bad input first so exactly one branch ever runs; previously a
    # valid three-argument call also fell through to the error message.
    if len(args) > 3:
        print("Error, too many arguments")
        return

    subTitle = args[2] if len(args) == 3 else ''
    if not args:
        # Default page range when no arguments are given.
        fromPage, toPage = 1, 2
    elif len(args) == 1:
        fromPage = int(args[0])
        toPage = fromPage + 1
    else:
        fromPage, toPage = int(args[0]), int(args[1])

    get_all_book_urls(fromPage, toPage, subTitle)


if __name__ == '__main__':
    main()
Thank you for your help!
This website checks if the referer is set while redirecting. You can just give the original url as referer in the header and easily bypass this. You can also see that the referer is used as a url parameter in the final download link.
import requests
from bs4 import BeautifulSoup
# Resolve the real download URL by requesting the redirect link with the
# article page supplied as the Referer header.
s = requests.Session()
url='https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/'
html=s.get(url).text
soup=BeautifulSoup(html,'html.parser')
download_anchor=soup.find('a',{'id':'download'})
if download_anchor is None:
    # Fail with a clear message instead of "NoneType is not subscriptable".
    raise SystemExit('No download link found on ' + url)
relative_link=download_anchor['href'] #get the relative link
download_redirect_link=url+relative_link
headers={
    "referer": url
}
# requests follows the redirect chain; r2.url is the final address.
r2=requests.get(download_redirect_link,headers=headers)
print(r2.url)
Output
https://rapidgator.net/file/80e881f7631eddb49de31e5718eb96ba?referer=https://scanlibs.com/neural-networks-systems-evolutionary-algorithms-2nd/
So, I'm working on a little program that will automatically check and download new music from a bunch of given YouTube channels. I'm currently working on a way to obtain the links of all the uploaded videos each channel has, which I'm doing like a scraper. (Yes, the YouTube API would probably be the proper way to go, but I don't know how to properly use it yet.)
from __future__ import unicode_literals
from bs4 import BeautifulSoup
import urllib.request
# Print the href of every video-title anchor present in the initial HTML of
# the channel's uploads page.
ytlink = 'https://www.youtube.com/channel/UCUvoulvwzCnUVk7yoduI_Gw/videos'
r = urllib.request.urlopen(ytlink).read()
soup = BeautifulSoup(r, "html.parser")
links = soup.find_all('a', {"class": "yt-uix-sessionlink yt-uix-tile-link spf-link yt-ui-ellipsis yt-ui-ellipsis-2"})
for tag in links:
    link = tag.get('href')
    if link is None:
        # Anchor without an href: nothing to print.
        continue
    print(link)
This is what I currently have, the problem is, it currently only grabs the first 30 video links since those are the only ones on screen. I've already seen that when the "Load More" button is pressed it executes some Ajax which is initiated by some JavaScript. My question is: How can I get Python to keep triggering the "Load More" button until all uploads are visible?
You can easily mimic the ajax calls and parse the json output returned, we just need to pull the /browse_ajax?action_continuation=... url and keep requesting until it is no longer in the json returned:
from bs4 import BeautifulSoup
import requests
from urlparse import urljoin # python 3 -> from urllib.parse import urljoin
def get_links():
    """Yield the absolute URL of every video listed for the channel.

    The first page is fetched as HTML. While the current page contains a
    "Load more" button, the continuation URL stored in its
    ``data-uix-load-more-href`` attribute is requested; the JSON reply's
    values are joined into one HTML fragment and parsed the same way.
    Iteration ends when a page has no such button, including the case where
    the very first page has none.
    """
    # CSS selectors for the video anchors and the "Load more" button.
    ytlink = 'https://www.youtube.com/channel/UCUvoulvwzCnUVk7yoduI_Gw/videos'
    ajax_css = "button[data-uix-load-more-href]"
    link_css = "a.yt-uix-sessionlink.yt-uix-tile-link.spf-link.yt-ui-ellipsis.yt-ui-ellipsis-2"
    base = "https://www.youtube.com/"

    soup = BeautifulSoup(requests.get(ytlink).content, "lxml")
    while True:
        # Emit the links visible on the page currently held in `soup`.
        for link in soup.select(link_css):
            yield urljoin(base, link["href"])

        # No "Load more" button means there is nothing further to fetch.
        load_more = soup.select(ajax_css)
        if not load_more:
            break

        ajax = load_more[0]["data-uix-load-more-href"]
        # Left in deliberately to show how the continuation URL changes.
        print(ajax)
        r = requests.get(urljoin(base, ajax))
        # The next chunk of HTML is spread over the values of the JSON reply.
        soup = BeautifulSoup("".join(r.json().values()), "lxml")
That would give you all 87 links.
In [26]: links = list(get_links())
/browse_ajax?action_continuation=1&continuation=4qmFsgJAEhhVQ1V2b3Vsdnd6Q25VVms3eW9kdUlfR3caJEVnWjJhV1JsYjNNZ0FEZ0JZQUZxQUhvQk1yZ0JBQSUzRCUzRA%253D%253D
/browse_ajax?action_continuation=1&continuation=4qmFsgJAEhhVQ1V2b3Vsdnd6Q25VVms3eW9kdUlfR3caJEVnWjJhV1JsYjNNZ0FEZ0JZQUZxQUhvQk03Z0JBQSUzRCUzRA%253D%253D
In [27]: len(links)
Out[27]: 87
In [28]: print(links)
['https://www.youtube.com/watch?v=kjmzIu4VJEY', 'https://www.youtube.com/watch?v=ecRpNV8Xob8', 'https://www.youtube.com/watch?v=mdHoaoAhnMo', 'https://www.youtube.com/watch?v=3oqBKEvdrqE', 'https://www.youtube.com/watch?v=VIbvfOd34-A', 'https://www.youtube.com/watch?v=x4G8ge1VO5s', 'https://www.youtube.com/watch?v=EkW0f2iUOCc', 'https://www.youtube.com/watch?v=Ex2NIeXfYl8', 'https://www.youtube.com/watch?v=XMd4pSX-aVs', 'https://www.youtube.com/watch?v=ZS7KjUjlLWA', 'https://www.youtube.com/watch?v=ZEq9sQJLOgg', 'https://www.youtube.com/watch?v=nSgaCowC5TY', 'https://www.youtube.com/watch?v=nV5Ive_zJT4', 'https://www.youtube.com/watch?v=snThWzMroaA', 'https://www.youtube.com/watch?v=Ud6YhBCucPg', 'https://www.youtube.com/watch?v=1nSfyivyxdg', 'https://www.youtube.com/watch?v=b7hf2wqpUY4', 'https://www.youtube.com/watch?v=cVBvxkVt9wc', 'https://www.youtube.com/watch?v=pcI25yU9yso', 'https://www.youtube.com/watch?v=EMIZZS8HY8A', 'https://www.youtube.com/watch?v=xWD3Zi23rIs', 'https://www.youtube.com/watch?v=M-IbllcTi64', 'https://www.youtube.com/watch?v=U_tW_UxG8bM', 'https://www.youtube.com/watch?v=vQd0mopVnQg', 'https://www.youtube.com/watch?v=mG8NJlsg4rI', 'https://www.youtube.com/watch?v=PsaNY6xpnKY', 'https://www.youtube.com/watch?v=839h3eZMSWA', 'https://www.youtube.com/watch?v=Q_yytPtWmP0', 'https://www.youtube.com/watch?v=oGESQfB9dYM', 'https://www.youtube.com/watch?v=mO5R-1uTJhg', 'https://www.youtube.com/watch?v=wgqLck9SFOc', 'https://www.youtube.com/watch?v=GCaFEsxd-Y8', 'https://www.youtube.com/watch?v=VlpMbnOqP20', 'https://www.youtube.com/watch?v=bj1QT5bxFlA', 'https://www.youtube.com/watch?v=SMtKCu6a7gQ', 'https://www.youtube.com/watch?v=RV6x33mf4WI', 'https://www.youtube.com/watch?v=WhlXuTtmNqE', 'https://www.youtube.com/watch?v=7TWN1G5e-tg', 'https://www.youtube.com/watch?v=jgjeYTkROyk', 'https://www.youtube.com/watch?v=0hFkFoOf-aA', 'https://www.youtube.com/watch?v=yH1u_KQapfw', 'https://www.youtube.com/watch?v=5-l-FGDsbjw', 
'https://www.youtube.com/watch?v=sFSgyE64Jjw', 'https://www.youtube.com/watch?v=OhDBtfvv2BM', 'https://www.youtube.com/watch?v=uFgPFi04oTo', 'https://www.youtube.com/watch?v=58a45EfYv1g', 'https://www.youtube.com/watch?v=jtYl5TbK2nc', 'https://www.youtube.com/watch?v=TI-1qxoDRnw', 'https://www.youtube.com/watch?v=Q0M90HqibHI', 'https://www.youtube.com/watch?v=Llb19v7QiXU', 'https://www.youtube.com/watch?v=sqhL_Ms6vuY', 'https://www.youtube.com/watch?v=YFFRgAjXs1Y', 'https://www.youtube.com/watch?v=8eHFG5AACHI', 'https://www.youtube.com/watch?v=_eVOx8Sw9Jg', 'https://www.youtube.com/watch?v=9s_XvG3M-UI', 'https://www.youtube.com/watch?v=lzdO01_tKFo', 'https://www.youtube.com/watch?v=uA2KkxfSW_U', 'https://www.youtube.com/watch?v=29Lt1LQtp5k', 'https://www.youtube.com/watch?v=nfJ9p5iJGz8', 'https://www.youtube.com/watch?v=cjMHd1xVlS0', 'https://www.youtube.com/watch?v=tkZ0FISTxkk', 'https://www.youtube.com/watch?v=bkhD8kYi4MI', 'https://www.youtube.com/watch?v=_bQajpTnOrY', 'https://www.youtube.com/watch?v=XglzEbcjP8c', 'https://www.youtube.com/watch?v=KBszbh6Qwag', 'https://www.youtube.com/watch?v=rVGWndVjCYg', 'https://www.youtube.com/watch?v=AgJxj2cUoyQ', 'https://www.youtube.com/watch?v=TaEVwakp_rI', 'https://www.youtube.com/watch?v=-YnpS-IaYCw', 'https://www.youtube.com/watch?v=sEFSFU2a9CY', 'https://www.youtube.com/watch?v=Jc2aVD4pwnk', 'https://www.youtube.com/watch?v=aY1dOJEv4j4', 'https://www.youtube.com/watch?v=bwjXt2pWoBE', 'https://www.youtube.com/watch?v=Dqn26tWxNsI', 'https://www.youtube.com/watch?v=wiv6JqGhcCU', 'https://www.youtube.com/watch?v=IFi47HLPqoM', 'https://www.youtube.com/watch?v=N1zdWugNdy0', 'https://www.youtube.com/watch?v=ngOBscDs3T4', 'https://www.youtube.com/watch?v=RT5dQVZ-VQY', 'https://www.youtube.com/watch?v=bifExgZW7k0', 'https://www.youtube.com/watch?v=fBEbaEgox1Y', 'https://www.youtube.com/watch?v=wDy9aGFngkY', 'https://www.youtube.com/watch?v=i06Iv0k5fVY', 'https://www.youtube.com/watch?v=2NaRXV7uyPE', 
'https://www.youtube.com/watch?v=Hl0nIoLJUU0', 'https://www.youtube.com/watch?v=iXo0T4dRdgA', 'https://www.youtube.com/watch?v=i-7H5Wq0_2Y']
I left the print(ajax) call in so you can see how it changes.
You could use selenium with PhantomJs which would look something like:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException,StaleElementReferenceException
# Drive a headless browser and keep clicking "Load more" until the button
# can no longer be found on the page.
ytlink = 'https://www.youtube.com/channel/UCUvoulvwzCnUVk7yoduI_Gw/videos'
hrefs = "a.yt-uix-sessionlink.yt-uix-tile-link.spf-link.yt-ui-ellipsis.yt-ui-ellipsis-2"
ajax= "button[data-uix-load-more-href]"
dr = webdriver.PhantomJS()
dr.get(ytlink)
while True:
    try:
        load_mode_b = dr.find_element_by_css_selector(ajax)
        load_mode_b.click()
    except NoSuchElementException as e:
        # The button is gone: report the exception and stop clicking.
        print(e)
        break
    except StaleElementReferenceException as e:
        # The element went stale; report it and look the button up again.
        print(e)
If we run this, we see exactly the same output:
In [32]: l = [a.get_attribute("href") for a in dr.find_elements_by_css_selector(hrefs)]
In [33]: len(l)
Out[33]: 87
In [34]: print(l)
[u'https://www.youtube.com/watch?v=kjmzIu4VJEY', u'https://www.youtube.com/watch?v=ecRpNV8Xob8', u'https://www.youtube.com/watch?v=mdHoaoAhnMo', u'https://www.youtube.com/watch?v=3oqBKEvdrqE', u'https://www.youtube.com/watch?v=VIbvfOd34-A', u'https://www.youtube.com/watch?v=x4G8ge1VO5s', u'https://www.youtube.com/watch?v=EkW0f2iUOCc', u'https://www.youtube.com/watch?v=Ex2NIeXfYl8', u'https://www.youtube.com/watch?v=XMd4pSX-aVs', u'https://www.youtube.com/watch?v=ZS7KjUjlLWA', u'https://www.youtube.com/watch?v=ZEq9sQJLOgg', u'https://www.youtube.com/watch?v=nSgaCowC5TY', u'https://www.youtube.com/watch?v=nV5Ive_zJT4', u'https://www.youtube.com/watch?v=snThWzMroaA', u'https://www.youtube.com/watch?v=Ud6YhBCucPg', u'https://www.youtube.com/watch?v=1nSfyivyxdg', u'https://www.youtube.com/watch?v=b7hf2wqpUY4', u'https://www.youtube.com/watch?v=cVBvxkVt9wc', u'https://www.youtube.com/watch?v=pcI25yU9yso', u'https://www.youtube.com/watch?v=EMIZZS8HY8A', u'https://www.youtube.com/watch?v=xWD3Zi23rIs', u'https://www.youtube.com/watch?v=M-IbllcTi64', u'https://www.youtube.com/watch?v=U_tW_UxG8bM', u'https://www.youtube.com/watch?v=vQd0mopVnQg', u'https://www.youtube.com/watch?v=mG8NJlsg4rI', u'https://www.youtube.com/watch?v=PsaNY6xpnKY', u'https://www.youtube.com/watch?v=839h3eZMSWA', u'https://www.youtube.com/watch?v=Q_yytPtWmP0', u'https://www.youtube.com/watch?v=oGESQfB9dYM', u'https://www.youtube.com/watch?v=mO5R-1uTJhg', u'https://www.youtube.com/watch?v=wgqLck9SFOc', u'https://www.youtube.com/watch?v=GCaFEsxd-Y8', u'https://www.youtube.com/watch?v=VlpMbnOqP20', u'https://www.youtube.com/watch?v=bj1QT5bxFlA', u'https://www.youtube.com/watch?v=SMtKCu6a7gQ', u'https://www.youtube.com/watch?v=RV6x33mf4WI', u'https://www.youtube.com/watch?v=WhlXuTtmNqE', u'https://www.youtube.com/watch?v=7TWN1G5e-tg', u'https://www.youtube.com/watch?v=jgjeYTkROyk', u'https://www.youtube.com/watch?v=0hFkFoOf-aA', u'https://www.youtube.com/watch?v=yH1u_KQapfw', 
u'https://www.youtube.com/watch?v=5-l-FGDsbjw', u'https://www.youtube.com/watch?v=sFSgyE64Jjw', u'https://www.youtube.com/watch?v=OhDBtfvv2BM', u'https://www.youtube.com/watch?v=uFgPFi04oTo', u'https://www.youtube.com/watch?v=58a45EfYv1g', u'https://www.youtube.com/watch?v=jtYl5TbK2nc', u'https://www.youtube.com/watch?v=TI-1qxoDRnw', u'https://www.youtube.com/watch?v=Q0M90HqibHI', u'https://www.youtube.com/watch?v=Llb19v7QiXU', u'https://www.youtube.com/watch?v=sqhL_Ms6vuY', u'https://www.youtube.com/watch?v=YFFRgAjXs1Y', u'https://www.youtube.com/watch?v=8eHFG5AACHI', u'https://www.youtube.com/watch?v=_eVOx8Sw9Jg', u'https://www.youtube.com/watch?v=9s_XvG3M-UI', u'https://www.youtube.com/watch?v=lzdO01_tKFo', u'https://www.youtube.com/watch?v=uA2KkxfSW_U', u'https://www.youtube.com/watch?v=29Lt1LQtp5k', u'https://www.youtube.com/watch?v=nfJ9p5iJGz8', u'https://www.youtube.com/watch?v=cjMHd1xVlS0', u'https://www.youtube.com/watch?v=tkZ0FISTxkk', u'https://www.youtube.com/watch?v=bkhD8kYi4MI', u'https://www.youtube.com/watch?v=_bQajpTnOrY', u'https://www.youtube.com/watch?v=XglzEbcjP8c', u'https://www.youtube.com/watch?v=KBszbh6Qwag', u'https://www.youtube.com/watch?v=rVGWndVjCYg', u'https://www.youtube.com/watch?v=AgJxj2cUoyQ', u'https://www.youtube.com/watch?v=TaEVwakp_rI', u'https://www.youtube.com/watch?v=-YnpS-IaYCw', u'https://www.youtube.com/watch?v=sEFSFU2a9CY', u'https://www.youtube.com/watch?v=Jc2aVD4pwnk', u'https://www.youtube.com/watch?v=aY1dOJEv4j4', u'https://www.youtube.com/watch?v=bwjXt2pWoBE', u'https://www.youtube.com/watch?v=Dqn26tWxNsI', u'https://www.youtube.com/watch?v=wiv6JqGhcCU', u'https://www.youtube.com/watch?v=IFi47HLPqoM', u'https://www.youtube.com/watch?v=N1zdWugNdy0', u'https://www.youtube.com/watch?v=ngOBscDs3T4', u'https://www.youtube.com/watch?v=RT5dQVZ-VQY', u'https://www.youtube.com/watch?v=bifExgZW7k0', u'https://www.youtube.com/watch?v=fBEbaEgox1Y', u'https://www.youtube.com/watch?v=wDy9aGFngkY', 
u'https://www.youtube.com/watch?v=i06Iv0k5fVY', u'https://www.youtube.com/watch?v=2NaRXV7uyPE', u'https://www.youtube.com/watch?v=Hl0nIoLJUU0', u'https://www.youtube.com/watch?v=iXo0T4dRdgA', u'https://www.youtube.com/watch?v=i-7H5Wq0_2Y']
I am scraping a site with Beautiful Soup. The problem I have is that certain parts of the site are paginated with JS, with an unknown (varying) number of pages to scrape.
I'm trying to get around this with a generator, but it's my first time writing one and I'm having a hard time wrapping my head around it and figuring out if what I'm doing makes sense.
Code:
from bs4 import BeautifulSoup
import urllib
import urllib2
import jabba_webkit as jw
import csv
import string
import re
import time
# CSV reader over the list of top-level domains (';'-delimited). It is
# created here but not consumed anywhere in the visible code.
tlds = csv.reader(open("top_level_domains.csv", 'r'), delimiter=';')
# CSV writer for the results; scrape_page() writes its rows through it.
# NOTE(review): neither file object is bound to a name or closed explicitly.
sites = csv.writer(open("websites_to_scrape.csv", "w"), delimiter=',')
# Top-level domain used when building links and written as the first column.
tld = "uz"
# NOTE(review): has_next and this module-level page are not read by any of
# the visible functions -- they look like leftovers; confirm before removing.
has_next = True
page = 0
def create_link(tld, page):
    """Return the domaintyper.com listing URL for *tld* at *page*.

    Page 0 maps to the bare listing URL; any other page gets a
    ``/page/<page>`` suffix.
    """
    prefix = "https://domaintyper.com/top-websites/most-popular-websites-with-"
    if page != 0:
        return prefix + tld + "-domain/page/" + repr(page)
    return prefix + tld + "-domain"
def check_for_next(soup):
    """Return True unless the page shows a disabled "Next" paging control.

    Parameters:
        soup: parsed page; it must provide ``find_all(class_=...)`` returning
            elements that provide ``get_text()``.

    Returns:
        False when any element with class ``pagingDivDisabled`` has "Next"
        in its text, True otherwise.
    """
    # Look at every disabled control, not just the first one: a page may
    # disable more than one (e.g. both directions on a single-page listing).
    # The text is compared rather than using `in` on the tag itself, which
    # only tests the tag's direct children for equality.
    for disabled_nav in soup.find_all(class_="pagingDivDisabled"):
        if "Next" in disabled_nav.get_text():
            return False
    return True
def make_soup(link):
    """Fetch *link* via ``jw.get_page`` and return it parsed with lxml."""
    return BeautifulSoup(jw.get_page(link), "lxml")
def all_the_pages(counter):
    """Yield consecutive page numbers, starting at *counter*, until the last.

    Each page is fetched to see whether a further page follows. The current
    number is yielded before that check, so the final page -- the one whose
    "Next" control is disabled -- is yielded too instead of being skipped.
    """
    while True:
        yield counter
        soup = make_soup(create_link(tld, counter))
        if not check_for_next(soup):
            # No page follows the one just yielded.
            return
        counter += 1
def scrape_page(soup):
    """Write one CSV row per selected cell of the page's rank table.

    Starting at index 1, every third ``<td>`` inside the ``<tbody>`` of the
    ``rankTable`` table is taken, its markup is stripped, and a row of
    ``tld`` plus the remaining text goes to the module-level ``sites`` writer.
    """
    rank_table = soup.find('table', {'class': 'rankTable'})
    cells = rank_table.find('tbody').find_all("td")
    for cell in cells[1::3]:
        # Remove anything that looks like an HTML tag from the cell's repr.
        text = re.sub("<[^>]*>", "", repr(cell))
        sites.writerow([tld, text])
def main():
    """Scrape every page reported by ``all_the_pages``, starting at page 0.

    For each page number the number and its URL are printed, then the page
    is fetched and passed to ``scrape_page``.
    """
    for page in all_the_pages(0):
        # print(...) with one argument behaves the same on Python 2 and 3.
        print(page)
        link = create_link(tld, page)
        print(link)
        scrape_page(make_soup(link))


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
My thinking behind the code:
The scraper should get the page, determine whether another page follows, scrape the current page and move on to the next one, repeating the process. If there is no next page, it should stop. Does the way I'm doing it here make sense?
As I told you, you could use selenium for programmatically clicking on the Next button, but since that is not an option for you, I can think of the following method to get the number of pages using pure BS4:
import requests
from bs4 import BeautifulSoup
def page_count():
    """Return the number of the first ``/page/N`` that has no data rows.

    Pages are requested in order starting at 1. The first page whose
    ``rankTable`` table has at most one ``<tr>``, or has no such table at
    all, ends the search and its number is returned.
    """
    pages = 1
    url = "https://domaintyper.com/top-websites/most-popular-websites-with-uz-domain/page/{}"
    while True:
        html = requests.get(url.format(pages)).content
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find('table', {'class': 'rankTable'})
        # A missing table is treated like an empty one instead of crashing.
        if table is None or len(table.find_all('tr')) <= 1:
            return pages
        pages += 1