Python - getting a unique list - python

I am using the code below:
import requests
from bs4 import BeautifulSoup
def recursiveUrl(url, link, depth):
    """Follow the first anchor of each page, recursing up to 5 levels.

    Parameters:
        url: base URL prefixed to each relative href.
        link: a BeautifulSoup anchor Tag whose ['href'] is followed.
        depth: current recursion depth; recursion stops at 5.

    Returns the base url at max depth, the last link when a page has no
    anchors, or a nested tuple of links otherwise.
    """
    if depth == 5:
        return url
    print(link['href'])
    page = requests.get(url + link['href'])
    soup = BeautifulSoup(page.text, 'html.parser')
    newlink = soup.find('a')
    # Bug fix: soup.find() returns None when no <a> exists; the original
    # len(newlink) raised TypeError on None and, for a real Tag, counted
    # its children rather than testing for absence.
    if newlink is None:
        return link
    return link, recursiveUrl(url, newlink, depth + 1)
def getLinks(url):
    """Return every anchor on *url* plus the result of recursively
    following each one via recursiveUrl().

    Bug fix: the original appended to ``links`` while iterating it, so
    the loop consumed its own output — an unbounded loop that eventually
    crashes when a non-Tag result is fed back into recursiveUrl.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    links = soup.find_all('a')
    # Accumulate into a copy; iterate only the original anchor list.
    results = list(links)
    for link in links:
        results.append(recursiveUrl(url, link, 0))
    return results
# Crawl the reviews index page (note: this network call runs at import time).
links = getLinks("https://www.rogerebert.com/reviews/")
def unique(links):
    """Print each distinct entry of *links* exactly once.

    Bug fix: ``{}`` creates an empty dict, which has no ``add`` method
    (the source of the asker's error); an empty set is ``set()``.
    """
    unique_values = set()
    for item in links:
        unique_values.add(item)
    for item in unique_values:
        print(item)
# Print the de-duplicated crawl results.
unique(links)
I have tried a number of ways of printing only unique entries, but my output is a long list like the one below. Ideally I should only print one of each unique entry:
Thanks again for all the help.

You have a mistake in your code: uniqueValues.add(i) fails because you previously
set uniqueValues to a dict, and dict has no add method!
import requests
from bs4 import BeautifulSoup
# Fetch the reviews index once and de-duplicate the hrefs with a set.
r = requests.get('https://www.rogerebert.com/reviews/')
soup = BeautifulSoup(r.text, 'html.parser')
links = {anchor.get('href') for anchor in soup.findAll('a')}
for href in links:
    print(href)

Instead of using a list, try using a set. That way you don't have multiple instances of the same website. (Note that {} creates an empty dict, not a set — an empty set must be written set().)
# Bug fix: {} is an empty dict (which has no .add method); an empty set
# must be created with set().
uniqueValues = set()
for i in links:
    uniqueValues.add(i)
for i in uniqueValues:
    print(i)

Related

Why do I get output from one but not the other?

from bs4 import BeautifulSoup
import requests
import re
def getHTMLdocument(url):
    """GET *url* and return the response body as a string."""
    return requests.get(url).text
def correct_url(url1):
    """Return *url1* as an absolute parliament.gov.sg URL.

    Relative paths get the site origin prepended; URLs that already
    start with the origin are returned unchanged.
    """
    base = 'https://www.parliament.gov.sg'
    return url1 if url1.startswith(base) else base + url1
# Question code (as posted): gather MP detail links, then fetch one page.
url_to_scrape = 'https://www.parliament.gov.sg/mps/list-of-current-mps'
links = []
while True:
    html_document = getHTMLdocument(url_to_scrape)
    soup = BeautifulSoup(html_document, 'lxml')
    # Stop once no /details/ anchors are found on the page.
    if soup.find_all('a', attrs={'href': re.compile("/details/")}) == []:
        break
    # NOTE(review): when anchors ARE found this loop re-fetches the same
    # URL again — presumably it never terminates normally; confirm intent.
    for link in soup.find_all('a', attrs={'href': re.compile("/details/")}):
        if link.get('href') not in links:
            links.append(correct_url(link.get('href')))
for link in links:
    url = link
    member_info = 'mp-designation-wrap'
    # NOTE(review): the ** markers are markdown-bold residue from the post;
    # this line re-assigns member_info, overwriting the value above — the
    # asker toggled between the two class names.
    **member_info = 'mp-constituency-wrap'**
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')
    txt1 = soup.find('div', attrs={'class': member_info})
    # find() returns None when the div is absent, so .text raises the
    # AttributeError described in the question.
    textoutput = txt1.text
    print(textoutput)
    break
I'm trying to separate the different categories so I can save them separately; however, I only get output when using member_info = 'mp-designation-wrap', and I get an AttributeError: 'NoneType' object has no attribute 'text' when using 'mp-constituency-wrap'.
I do not understand why it is giving me different results and it would be great if someone could help me understand why it is so and point me in the right direction
The reason you get this error is that the element you are trying to select does not exist in some of your resources, so you have to check for that before calling .text.
# For each MP page, read the two wrapper divs only if they exist,
# so .text is never called on None.
for link in links:
    page = requests.get(link)
    soup = BeautifulSoup(page.text, 'lxml')
    designation = soup.find('div', attrs={'class': 'mp-designation-wrap'})
    constituency = soup.find('div', attrs={'class': 'mp-constituency-wrap'})
    text1 = designation.text if designation else None
    text2 = constituency.text if constituency else None
    print(text2)

how to edit a link that is stored in a list

import requests
import re
def getHTMLdocument(url):
    """Fetch *url* over HTTP and return the decoded response body."""
    resp = requests.get(url)
    return resp.text
# Question code: collect unique /details/ hrefs from the cabinet page.
url_to_scrape = 'https://www.parliament.gov.sg/about-us/structure/the-cabinet'
links = []
while True:
    html_document = getHTMLdocument(url_to_scrape)
    # NOTE(review): BeautifulSoup is used but never imported in this
    # snippet — it relies on an import elsewhere.
    soup = BeautifulSoup(html_document, 'lxml')
    # Exit when the page contains no matching anchors.
    if soup.find_all('a', attrs={'href': re.compile("/details/")}) == []:
        break
    # NOTE(review): when matches exist, the while-loop re-fetches the same
    # URL on every pass; the dedupe check just keeps the list from growing.
    for link in soup.find_all('a', attrs={'href': re.compile("/details/")}):
        if link.get('href') not in links:
            links.append(link.get('href'))
print(links)
Currently, this is the code that I have, which gives me an output list of
['/mps/current-list-of-mps/mp/details/lee-hsien-loong', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/heng-swee-keat', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/teo-chee-hean', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/tharman-shanmugaratnam', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/ng-eng-hen', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/vivian-balakrishnan', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/k-shanmugam', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/gan-kim-yong', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/s-iswaran', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/grace-fu-hai-yien', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/chan-chun-sing', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/lawrence-wong', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/masagos-zulkifli-bin-masagos-mohamad', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/ong-ye-kung', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/desmond-lee', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/josephine-teo', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/indranee-rajah', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/mohamad-maliki-bin-osman', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/edwin-tong-chun-fai', 'https://www.parliament.gov.sg/mps/list-of-current-mps/mp/details/tan-see-leng']
With the next part of my code, I am trying to scrape data from each of these links, however, as the first link within the list doesn't come out as a valid url, I am not able to obtain information from it.
How can I edit it such that it will be the same as the other urls in the list?
many thanks
Before you add the string to the list, you can check whether it has the right format by using this code, and correct it if needed:
def correct_url(url):
    """Prefix relative parliament.gov.sg paths with the site origin.

    Bug fix: the original ended with ``return URL`` — Python identifiers
    are case-sensitive, so that name is undefined and raises NameError;
    it must return ``url``.
    """
    if not url.startswith('https://www.parliament.gov.sg'):
        url = f'https://www.parliament.gov.sg{url}'
    return url
the for loop adopted to the new function:
# Append the normalized detail-page URLs that are not already collected.
for anchor in soup.find_all('a', attrs={'href': re.compile("/details/")}):
    href = anchor.get('href')
    if href not in links:
        links.append(correct_url(href))
print(links)

python crawl one page

I tried to extract links (href) which start with a specific word, but it returns an empty list even though the page source contains many links that satisfy the condition. I am definitely missing something; below is my code:
import requests
from bs4 import BeautifulSoup
import string
import os
import re
def extract_href_page(page):
    """Return the hrefs of anchors whose href matches ``\\w*first_word``.

    Bug fix: the original called ``find_all('a', pattern=re.compile(...))``.
    BeautifulSoup treats unknown keywords as HTML *attribute* filters, so it
    searched for ``<a pattern="...">`` tags — which never exist — and always
    returned an empty list. The regex must filter the ``href`` attribute.
    """
    soup = BeautifulSoup(page, 'html.parser')  # explicit parser: deterministic
    links = soup.find_all('a', href=re.compile(r'\w*first_word'))
    # pattern = re.compile(r'\w*recette')
    print(links)
    # Save href only, for example.
    return [link['href'] for link in links]
# Crawl listing pages 1..62 and print the links extracted from each.
for page_number in range(1, 63):
    page_url = ("https://www.website.com/pages/" + "page".capitalize()
                + "-" + str(page_number) + ".html")
    requete = requests.get(page_url)
    list_links = extract_href_page(requete.content)
    print(list_links)
    for link in list_links:
        print(link)
Try this:
import requests
from bs4 import BeautifulSoup
import string
import os
import re
def extract_href_page(page):
    """Print every anchor with an href, then collect the hrefs that
    match ``\\w*first_word`` (case-insensitive) into ``all_links``."""
    soup = BeautifulSoup(page)
    anchors = soup.find_all('a', href=True)
    # pattern = re.compile(r'\w*recette')
    print(anchors)
    all_links = [a.get("href") for a in anchors
                 if re.match(r"\w*first_word", a["href"], re.I)]
    ...

Convert a result in the console into a list Python

I want to web scrape a list of urls from a web site and then open them one by one.
I can get the list of all urls but then try to turn into a list things get wrong.
When I print the list, intead of geting [url1, urls2...] I get something like this in the console:
[url1,url2,url3] dif line
[url1,url2,url3,url4] difline
[url1,url2,url3,url4,url5]
Find the my script bellow:
# Question code (Selenium): collect every href on the page into a list.
driver = webdriver.Chrome()
my_url = "https://prog.nfz.gov.pl/app-jgp/AnalizaPrzekrojowa.aspx"
driver.get(my_url)
time.sleep(3)
content = driver.page_source.encode('utf-8').strip()
# NOTE(review): `soup` is presumably BeautifulSoup imported under this
# alias — confirm against the snippet's missing import lines.
page_soup = soup(content, "html.parser")
links = []
for link in page_soup.find_all('a', href=True):
    url = link['href']
    ai = str(url)
    links.append(ai)
    # Printing inside the loop is what produces the incremental
    # "[url1] / [url1, url2] / ..." console output the asker describes.
    print(links)
    # NOTE(review): these two lines duplicate the append/print above,
    # so every href is added twice — likely a paste error in the post.
    links.append(ai)
    print(links)
I have rewritten your code a little. First you need to load and scrape the main page to get all links from "href". After that just use the scraped urls in a loop to get the next pages.
Also there is some junk in "href" which isn't url so you have to clean it first.
I prefer requests to do GET.
http://docs.python-requests.org/en/master/
I hope it helps.
from bs4 import BeautifulSoup
import requests
def main():
    """Scrape all hrefs from the NFZ analysis page and return only the
    absolute http(s) ones (the rest is postback junk, not URLs)."""
    url = "https://prog.nfz.gov.pl/app-jgp/AnalizaPrzekrojowa.aspx"
    web_page = requests.get(url)
    soup = BeautifulSoup(web_page.content, "html.parser")
    links = [tag.get("href") for tag in soup.find_all('a', href=True)]
    print(links)  # just to demonstrate that links are there
    cleaned_list = [href for href in links if "http" in href]
    print(cleaned_list)
    return cleaned_list
def load_pages_from_links(urls):
    """Download every URL in *urls* and print the collected page bodies.

    Returns the string "There are no links." when *urls* is empty,
    otherwise returns None after printing the downloaded pages.
    """
    user_agent = {'User-agent': 'Mozilla/5.0'}
    if not urls:
        return "There are no links."
    downloaded_pages = {}
    for nr, link in enumerate(urls):
        response = requests.get(link, headers=user_agent)
        downloaded_pages[nr] = response.content
    # Assumes the original printed once after the loop — confirm; the
    # post's indentation was lost in extraction.
    print(downloaded_pages)
# Script entry point: scrape the link list, then download each page.
if __name__ == "__main__":
    links = main()
    load_pages_from_links(links)

Extract links from html page

I am trying to fetch all movie/show netflix links from here http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html and also their country name. e.g from the page source, I want http://www.netflix.com/WiMovie/80048948, USA, etc. I have done the following. But it returns all links instead of the netflix ones I want. I am a little new to regex. How should I go about this?
from BeautifulSoup import BeautifulSoup
import urllib2
import re
# Question code (Python 2, BeautifulSoup 3): prints every href, then flags.
html_page = urllib2.urlopen('http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html')
soup = BeautifulSoup(html_page)
for link in soup.findAll('a'):
    ##reqlink = re.search('netflix',link.get('href'))
    ##if reqlink:
    # link.get('href') is None for anchors without an href, which is why
    # the commented-out re.search raised "expected string or buffer".
    print link.get('href')
for link in soup.findAll('img'):
    if link.get('alt') == 'UK' or link.get('alt') == 'USA':
        print link.get('alt')
If I uncomment the lines above, I get the following error:
TypeError: expected string or buffer
What should I do?
from BeautifulSoup import BeautifulSoup
import urllib2
import re
import requests
# Answer code (Python 2): stream the page and parse only line 746 of the
# response, which holds the listings table. Indentation reconstructed —
# the post's original layout was lost in extraction.
url = 'http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html'
r = requests.get(url, stream=True)
count = 1
title=[]
country=[]
for line in r.iter_lines():
    if count == 746:
        # NOTE(review): this urlopen result is unused — dead code.
        urllib2.urlopen('http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html')
        soup = BeautifulSoup(line)
        for link in soup.findAll('a', href=re.compile('netflix')):
            title.append(link.get('href'))
        for link in soup.findAll('img'):
            print link.get('alt')
            country.append(link.get('alt'))
    count = count + 1
print len(title), len(country)
The previous error has been worked upon. Now the only thing to look for is films with multiple countries. How to get them together.
e.g. for 10.0 Earthquake, link = http://www.netflix.com/WiMovie/80049286, country = UK, USA.
Your code can be simplified to a couple of selects:
import requests
from bs4 import BeautifulSoup
# Print the href of every anchor that links to netflix.com.
url = 'http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html'
r = requests.get(url)
soup = BeautifulSoup(r.content)
netflix_anchors = soup.select("a[href*=netflix]")
for a in netflix_anchors:
    print(a["href"])
And for the img:
# Show the flag images whose alt text is one of the two countries.
co = {"UK", "USA"}
for img in soup.select("img[alt]"):
    alt_text = img["alt"]
    if alt_text in co:
        print(img)
As for the first question - it failed for links that didn't have an href value. So instead of a string you got None.
The following works:
from BeautifulSoup import BeautifulSoup
import urllib2
import re
html_page = urllib2.urlopen('http://netflixukvsusa.netflixable.com/2016/
07/complete-alphabetical-list-k-sat-jul-9.html')
soup = BeautifulSoup(html_page)
for link in soup.findAll('a'):
link_href = link.get('href')
if link_href:
reqlink = re.search('netflix',link_href)
if reqlink:
print link_href
for link in soup.findAll('img'):
if link.get('alt') == 'UK' or link.get('alt') == 'USA':
print link.get('alt')
As for the second question, I would recommend having a dictionary between the movie to a list of countries that it appears in, then it would be easier to format it in a string the way you want.
I think you'd have an easier time iterating through the listing rows and using a generator to assemble the data structure you're looking for (ignore the minor differences in my code, I'm using Python 3):
from bs4 import BeautifulSoup
import requests
# Parse the listings table and pair each netflix link with its flags.
url = ('http://netflixukvsusa.netflixable.com/2016/07/'
       'complete-alphabetical-list-k-sat-jul-9.html')
r = requests.get(url)
soup = BeautifulSoup(r.content)
rows = soup.select('span[class="listings"] tr')

def get_movie_info(rows):
    """Yield (netflix href, [country codes]) for each listing row that
    contains a netflix.com anchor."""
    netflix_url_prefix = 'http://www.netflix.com/'
    for row in rows:
        anchor = row.find('a',
                          href=lambda h: h and netflix_url_prefix in h)
        if anchor is None:
            continue
        countries = [img['alt'] for img in row('img', class_='flag')]
        yield anchor['href'], countries

print('\n'.join(map(str, get_movie_info(rows))))
Edit: Or if you're looking for a dict instead of a list:
def get_movie_info(rows):
    """Map movie name -> {'link': href, 'countries': [codes]} for every
    listing row that contains a netflix.com anchor."""
    netflix_url_prefix = 'http://www.netflix.com/'
    output = {}
    for row in rows:
        anchor = row.find('a',
                          href=lambda h: h and netflix_url_prefix in h)
        if anchor is None:
            continue
        countries = [img['alt'] for img in row('img', class_='flag')]
        output[anchor.text or 'some_default'] = {
            'link': anchor['href'],
            'countries': countries,
        }
    return output

print('\n'.join(map(str, get_movie_info(rows).items())))
# Asker's final version: build a list of dicts with title, url, and the
# countries per row. Indentation reconstructed — layout lost in extraction.
url = 'http://netflixukvsusa.netflixable.com/2016/07/complete-alphabetical-list-k-sat-jul-9.html'
r = requests.get(url, stream=True)
count = 1
final=[]
for line in r.iter_lines():
    # Only line 746 of the streamed response holds the listings table.
    if count == 746:
        soup = BeautifulSoup(line)
        for row in soup.findAll('tr'):
            # NOTE(review): rebinding `url` here shadows the page URL above.
            url = row.find('a', href=re.compile('netflix'))
            if url:
                t=url.string
                u=url.get('href')
                one=[]
                for country in row.findAll('img'):
                    one.append(country.get('alt'))
                final.append({'Title':t,'Url':u,'Countries':one})
    count = count + 1
final is the final list.

Categories

Resources