Extract links from HTML page - Python

I am trying to fetch all movie/show Netflix links from here http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html and also their country names. E.g. from the page source, I want http://www.netflix.com/WiMovie/80048948, USA, etc. I have done the following, but it returns all links instead of the Netflix ones I want. I am a little new to regex. How should I go about this?
from BeautifulSoup import BeautifulSoup
import urllib2
import re

html_page = urllib2.urlopen('http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html')
soup = BeautifulSoup(html_page)
for link in soup.findAll('a'):
    ##reqlink = re.search('netflix',link.get('href'))
    ##if reqlink:
    print link.get('href')
for link in soup.findAll('img'):
    if link.get('alt') == 'UK' or link.get('alt') == 'USA':
        print link.get('alt')
If I uncomment the lines above, I get the following error:
TypeError: expected string or buffer
What should I do?
from BeautifulSoup import BeautifulSoup
import urllib2
import re
import requests

url = 'http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html'
r = requests.get(url, stream=True)
count = 1
title = []
country = []
for line in r.iter_lines():
    if count == 746:
        urllib2.urlopen('http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html')
        soup = BeautifulSoup(line)
        for link in soup.findAll('a', href=re.compile('netflix')):
            title.append(link.get('href'))
        for link in soup.findAll('img'):
            print link.get('alt')
            country.append(link.get('alt'))
    count = count + 1
print len(title), len(country)
The previous error has been worked upon. Now the only thing left is films with multiple countries; how do I get them together?
E.g. for 10.0 Earthquake, link = http://www.netflix.com/WiMovie/80049286, country = UK, USA.

Your code can be simplified to a couple of selects:
import requests
from bs4 import BeautifulSoup
url = 'http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html'
r = requests.get(url)
soup = BeautifulSoup(r.content)
for a in soup.select("a[href*=netflix]"):
    print(a["href"])
And for the img:
co = {"UK", "USA"}
for img in soup.select("img[alt]"):
    if img["alt"] in co:
        print(img)

As for the first question: it failed for links that didn't have an href value, so instead of a string you got None.
The following works:
from BeautifulSoup import BeautifulSoup
import urllib2
import re
html_page = urllib2.urlopen('http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html')
soup = BeautifulSoup(html_page)
for link in soup.findAll('a'):
    link_href = link.get('href')
    if link_href:
        reqlink = re.search('netflix', link_href)
        if reqlink:
            print link_href
for link in soup.findAll('img'):
    if link.get('alt') == 'UK' or link.get('alt') == 'USA':
        print link.get('alt')
As for the second question, I would recommend having a dictionary mapping each movie to the list of countries it appears in; then it would be easier to format it as a string the way you want.
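For example, a minimal sketch of that idea in the same Python 2 style, reusing soup and re from the snippet above (the movie_countries name is just for illustration):
movie_countries = {}
for row in soup.findAll('tr'):
    link = row.find('a', href=re.compile('netflix'))
    if link:
        # map the movie title to the list of flag alt texts in its row
        movie_countries[link.string] = [img.get('alt') for img in row.findAll('img')]
for movie, countries in movie_countries.items():
    print '%s: %s' % (movie, ', '.join(c for c in countries if c))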

I think you'd have an easier time iterating through the listing rows and using a generator to assemble the data structure you're looking for (ignore the minor differences in my code, I'm using Python 3):
from bs4 import BeautifulSoup
import requests

url = 'http://netflixukvsusa.netflixable.com/2016/07/' \
      'complete-alphabetical-list-k-sat-jul-9.html'
r = requests.get(url)
soup = BeautifulSoup(r.content)
rows = soup.select('span[class="listings"] tr')

def get_movie_info(rows):
    netflix_url_prefix = 'http://www.netflix.com/'
    for row in rows:
        link = row.find('a',
                        href=lambda href: href and netflix_url_prefix in href)
        if link is not None:
            link = link['href']
            # row('img', ...) is shorthand for row.find_all('img', ...)
            countries = [img['alt'] for img in row('img', class_='flag')]
            yield link, countries

print('\n'.join(map(str, get_movie_info(rows))))
Edit: Or if you're looking for a dict instead of a list:
def get_movie_info(rows):
    output = {}
    netflix_url_prefix = 'http://www.netflix.com/'
    for row in rows:
        link = row.find('a',
                        href=lambda href: href and netflix_url_prefix in href)
        if link is not None:
            name = link.text
            link = link['href']
            countries = [img['alt'] for img in row('img', class_='flag')]
            output[name or 'some_default'] = {'link': link, 'countries': countries}
    return output

print('\n'.join(map(str, get_movie_info(rows).items())))

url = 'http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html'
r = requests.get(url, stream=True)
count = 1
final = []
for line in r.iter_lines():
    if count == 746:
        soup = BeautifulSoup(line)
        for row in soup.findAll('tr'):
            url = row.find('a', href=re.compile('netflix'))
            if url:
                t = url.string
                u = url.get('href')
                one = []
                for country in row.findAll('img'):
                    one.append(country.get('alt'))
                final.append({'Title': t, 'Url': u, 'Countries': one})
    count = count + 1
final is the final list.
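To get films with multiple countries together (e.g. country = UK, USA), each entry of final can then be formatted by joining its Countries list; a small sketch, assuming the dicts built above:
for movie in final:
    # join the country list into one comma-separated string, skipping None alts
    countries = ', '.join(c for c in movie['Countries'] if c)
    print '%s, link = %s, country = %s' % (movie['Title'], movie['Url'], countries)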

Related

Can anyone please guide me on how I can do web scraping of multiple pages of booking.com?

This is the URL:
url = 'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_c
Hotel_name = doc.find_all("div",{'class' : "fcab3ed991 a23c043802"})
This gives me all the hotel names on page 1, but how can I get the hotel names from all the pages?
I've tried this:
import requests
from bs4 import BeautifulSoup

# Initialize the page number
page_number = 0

while True:
    # Increment the page number
    page_number += 1
    # Make the GET request to the URL
    url = f"https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15&offset={page_number*15}"
    response = requests.get(url)
    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')
    # Extract the hotel information
    hotels = soup.find_all('div', {'class' : "fcab3ed991 a23c043802"})
    if not hotels:
        break
    for hotel in hotels:
        price = hotel.find('div', {' data-testid="title'}).text
        print(f"{price}")
but it gives me an empty list as an output.
Avoid selecting elements by classes that look highly dynamic and use the HTML structure instead. Check the number of total results and use it in range() to iterate over the result pages.
Example
import requests, re
from bs4 import BeautifulSoup

data = []

soup = BeautifulSoup(
    requests.get('https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15',
                 headers={'user-agent': 'some agent'}
    ).text)

num_results = int(re.search(r'\d+', soup.select_one('div:has(+[data-testid="pagination"])').text).group(0))

for i in range(0, int(num_results/25)):
    soup = BeautifulSoup(
        requests.get(f'https://www.booking.com/searchresults.html?label=gen173nr-1FCAEoggI46AdIM1gEaGyIAQGYATG4ARfIAQzYAQHoAQH4AQKIAgGoAgO4AuS4sJ4GwAIB0gIkYWJlYmZiMWItNWJjMi00M2Y2LTk3MGUtMzI2ZGZmMmIyNzMz2AIF4AIB&aid=304142&dest_id=-2092174&dest_type=city&group_adults=2&req_adults=2&no_rooms=1&group_children=0&req_children=0&nflt=ht_id%3D204&rows=15&offset={int(i*25)}',
                     headers={'user-agent': 'some agent'}
        ).text
    )
    data.extend([e.select_one('[data-testid="title"]').text for e in soup.select('[data-testid="property-card"]')])

data

Web scraping doesn't iterate over entire webpage

I'm trying to scrape the information of all the player names and player ratings from this website:
https://www.fifaindex.com/players/?gender=0&league=1&order=desc
But I only get the information for the first player on the page.
The code I'm using:
from bs4 import BeautifulSoup
import requests

url = "https://www.fifaindex.com/players/?gender=0&league=1&order=desc"
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find_all('div', class_="responsive-table table-rounded")
for result in results:
    rating = result.find("span", class_="badge badge-dark rating r3").text
    name = result.find("a", class_="link-player")
    info = [rating, name]
    print(info)
The parsed HTML is attached in the picture.
I tinkered around a little bit and I think I got a version that does what you want:
from bs4 import BeautifulSoup
import requests

page = requests.get("https://www.fifaindex.com/players/?gender=0&league=1&order=desc")
soup = BeautifulSoup(page.content, "html.parser")
results = soup.find_all("tr")
for result in results:
    try:
        result["data-playerid"]
    except KeyError:
        continue
    rating = result.find("span", class_="badge badge-dark rating r3").text
    name = result.find("a", class_="link-player")
    info = [rating, name]
    print(info)
Getting all table rows with a data-playerid attribute will fix it:
#!/usr/bin/env python3
from bs4 import BeautifulSoup
import requests

url = "https://www.fifaindex.com/players/?gender=0&league=1&order=desc"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
results = soup.find_all('tr', {'data-playerid': True})
for res in results:
    rating = res.find("span", class_="badge badge-dark rating r3").text
    name = res.find("a", class_="link-player")
    info = [rating, name]
    print(info)

Scraping links from Wikipedia

So I am trying to scrape links from a random Wikipedia page; here is my code thus far:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import urllib2

# function get random page
def get_random():
    import requests
    # r = requests.get('https://en.wikipedia.org/wiki/Special:Random')
    r = requests.get('https://en.wikipedia.org/wiki/Carole_Ann')
    return r.url

#========================

# finding the valid link
def validlink(href):
    if href:
        if re.compile('^/wiki/').search(href):
            if not re.compile('/\w+:').search(href):
                return True
    return False

#validlink()===========

# the first site
a1 = get_random()
#print("the first site is: " + a1)
# the first site end()====

# looking for the article name:
blin = requests.get(a1)
soup = BeautifulSoup(blin.text, 'html.parser')
title = soup.find('h1', {'class' : 'firstHeading'})
print("starting website: " + a1 + " Titled: " + title.text)
print("")

#=============================
# first article done
# find body:
import re
body = requests.get(a1).text
soup = BeautifulSoup(body, 'lxml')
for link in soup.findAll("a"):
    url = link.get("href", "")
print(
#======================
I know I'm doing this last part wrong. I'm new to Python, so I just have no idea how to go about it. What I need is to pull all of the links from the random site that the random page takes me to, then pull the link and title off of that site.
Then I need to pull the Wikipedia links off of that page, which is what I am trying to do in that last bit of code there.
At this point I want to print all of the links that it finds, after they have been tested against my validlink function at the top.
Again, forgive me for being new and not understanding this. But please help, I cannot figure this out.
So the question that I have is: I need to create a snippet of code that will pull out all of the website links off of the Wikipedia page (note I still don't know how to do this; the for loop was my best guess based on my own research), then I need to test the links I pulled against my validlink function, and print out all of the valid links.
If you want it as a list, then create a new list and append() the url if it is valid.
Because the same url can appear many times on a page, I also check whether the url is already in the list.
valid_urls = []

for link in soup.find_all('a'): # find_all('a', {'href': True}):
    url = link.get('href', '')
    if url not in valid_urls and validlink(url):
        valid_urls.append(url)

print(valid_urls)
from bs4 import BeautifulSoup
import requests
import re

# --- functions ---

def is_valid(url):
    """finding the valid link"""
    if url:
        if url.startswith('/wiki/'): # you don't need `re` to check it
            if not re.compile('/\w+:').search(url):
                return True
    return False

# --- main ---

#random_url = 'https://en.wikipedia.org/wiki/Special:Random'
random_url = 'https://en.wikipedia.org/wiki/Carole_Ann'

r = requests.get(random_url)
print('url:', r.url)

soup = BeautifulSoup(r.text, 'html.parser')

title = soup.find('h1', {'class': 'firstHeading'})
print('starting website:', r.url)
print('titled:', title.text)
print()

valid_urls = []

for link in soup.find_all('a'): # find_all('a', {'href': True}):
    url = link.get('href', '')
    if url not in valid_urls and is_valid(url):
        valid_urls.append(url)

#print(valid_urls)

#for url in valid_urls:
#    print(url)

print('\n'.join(valid_urls))

Python - getting a unique list

I am using the code below:
import requests
from bs4 import BeautifulSoup

def recursiveUrl(url, link, depth):
    if depth == 5:
        return url
    else:
        print(link['href'])
        page = requests.get(url + link['href'])
        soup = BeautifulSoup(page.text, 'html.parser')
        newlink = soup.find('a')
        if len(newlink) == 0:
            return link
        else:
            return link, recursiveUrl(url, newlink, depth + 1)

def getLinks(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    links = soup.find_all('a')
    for link in links:
        links.append(recursiveUrl(url, link, 0))
    return links

links = getLinks("https://www.rogerebert.com/reviews/")

def unique(links):
    uniqueValues = {}
    for i in links:
        uniqueValues.add(i)
    for i in uniqueValues:
        print(i)

unique(links)
I have tried a number of ways of printing only unique entries, but my output is a long list like the one below; ideally I should only print one of each unique entry:
Thanks again for all the help.
You have a mistake in your code: uniqueValues.add(i) fails because you set uniqueValues to a dict previously, and dict has no add!
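The direct fix is a one-liner (a sketch, keeping the rest of your unique() unchanged):
uniqueValues = set()  # set() creates an empty set; {} creates an empty dict
Or rewritten from scratch: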
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.rogerebert.com/reviews/')
soup = BeautifulSoup(r.text, 'html.parser')

links = set()
for item in soup.findAll('a'):
    item = item.get('href')
    links.add(item)

for item in links:
    print(item)
Instead of using a list, try using a set. That way you don't have multiple instances of the same website.
uniqueValues = set()  # note: set(), since {} would create an empty dict
for i in links:
    uniqueValues.add(i)
for i in uniqueValues:
    print(i)

How do I simultaneously scrape two pages and produce two distinct lists within one nested 'for-loop'?

I'm scraping from two URLs that have the same DOM structure, and so I'm trying to find a way to scrape both of them at the same time.
The only caveat is that the data scraped from both these pages needs to end up in distinctly named lists.
To explain with an example, here is what I've tried:
import os
import requests
from bs4 import BeautifulSoup as bs

urls = ['https://www.basketball-reference.com/leaders/ws_career.html',
        'https://www.basketball-reference.com/leaders/ws_per_48_career.html',]

ws_list = []
ws48_list = []
categories = [ws_list, ws48_list]

for url in urls:
    response = requests.get(url, headers=headers)
    soup = bs(response.content, 'html.parser')
    section = soup.find('table', class_='stats_table')
    for a in section.find_all('a'):
        player_name = a.text
        for cat_list in categories:
            cat_list.append(player_name)

print(ws48_list)
print(ws_list)
This ends up printing two identical lists, when I was shooting for two lists each unique to its page.
How do I accomplish this? Would it be better practice to code it another way?
Instead of trying to append to already existing lists, just create new ones. Make a function to do the scrape and pass each url in turn to it.
import os
import requests
from bs4 import BeautifulSoup as bs

urls = ['https://www.basketball-reference.com/leaders/ws_career.html',
        'https://www.basketball-reference.com/leaders/ws_per_48_career.html',]

def parse_page(url, headers={}):
    response = requests.get(url, headers=headers)
    soup = bs(response.content, 'html.parser')
    section = soup.find('table', class_='stats_table')
    return [a.text for a in section.find_all('a')]

ws_list, ws48_list = [parse_page(url) for url in urls]

print('ws_list = %r' % ws_list)
print('ws48_list = %r' % ws48_list)
Just add them to the appropriate list and the problem is solved?
for i, url in enumerate(urls):
    response = requests.get(url)
    soup = bs(response.content, 'html.parser')
    section = soup.find('table', class_='stats_table')
    for a in section.find_all('a'):
        player_name = a.text
        categories[i].append(player_name)

print(ws48_list)
print(ws_list)
You can use a function to define your scraping logic, then just call it for your urls.
import os
import requests
from bs4 import BeautifulSoup as bs

def scrape(url):
    response = requests.get(url)
    soup = bs(response.content, 'html.parser')
    section = soup.find('table', class_='stats_table')
    names = []
    for a in section.find_all('a'):
        player_name = a.text
        names.append(player_name)
    return names

ws_list = scrape('https://www.basketball-reference.com/leaders/ws_career.html')
ws48_list = scrape('https://www.basketball-reference.com/leaders/ws_per_48_career.html')

print(ws_list)
print(ws48_list)
