Trouble scraping Wikipedia articles using Python

This Wikipedia article (https://en.wikipedia.org/wiki/List_of_computer_scientists) contains a list of articles about notable computer scientists. I have to write a script that collects the following info for each of them:
1. Their full name
2. The number of awards they have
3. The universities they've attended
I've already written the following code to gather the links to each article.
import requests
from bs4 import BeautifulSoup

URL = "https://en.wikipedia.org/wiki/List_of_computer_scientists"
response = requests.get(URL)
soup = BeautifulSoup(response.content, 'html.parser')

lines = soup.find(id="mw-content-text").find_all("li")
valid_links = []
for line in lines:
    link = line.find("a")
    if link['href'].find("/wiki/") == -1:
        continue
    if link.text == "Lists portal":
        break
    valid_links.append("https://en.wikipedia.org" + link['href'])
It's also pretty easy to get their full name (it's just the title of each page). However, I'm having trouble writing a script that can get items 2 and 3 correctly for each one.
What I have so far is the following:
response = requests.get(url)
soup = BeautifulSoup(response.content, 'html.parser')

scientist_name = soup.find(id="firstHeading").string
# the infobox lookup result needs to be stored somewhere to be usable
infobox = soup.find(id="mw-content-text").find("table", class_="infobox biography vcard")
scientist_education = "PLACEHOLDER"
scientist_awards = "PLACEHOLDER"

Try the following code:
import requests
from bs4 import BeautifulSoup

URL = "https://en.wikipedia.org/wiki/List_of_computer_scientists"
response = requests.get(URL)
soup = BeautifulSoup(response.content, 'html.parser')

lines = soup.find(id="mw-content-text").find_all("li")
valid_links = []
for line in lines:
    link = line.find("a")
    if link['href'].find("/wiki/") == -1:
        continue
    if link.text == "Lists portal":
        break
    valid_links.append("https://en.wikipedia.org" + link['href'])

for url in valid_links:
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    name = soup.find(id="firstHeading").string
    edu = soup.find(lambda tag: len(tag.find_all()) == 0 and "Institutions" in tag.text)
    edux = [i.text.strip() for i in edu.find_next_siblings("td")] if edu else []
    awards = soup.find(lambda tag: len(tag.find_all()) == 0 and "Awards" in tag.text)
    awardsx = [i.text.strip() for i in awards.find_next_siblings("td")] if awards else []
    res = {"name": name, "education": edux, "awards": awardsx}
    print(res)
It returns the following output:
{'name': 'Atta ur Rehman Khan', 'education': ['Ajman University King Saud University University of Malaya Sohar University COMSATS University Air University (Pakistan Air Force) Qurtuba University'], 'awards': []}
{'name': 'Wil van der Aalst', 'education': ['RWTH Aachen University'], 'awards': []}
{'name': 'Scott Aaronson', 'education': ['University of Texas at Austin\nMassachusetts Institute of Technology\nInstitute for Advanced Study\nUniversity of Waterloo'], 'awards': ['Alan T. Waterman Award\nPECASE\nTomassoni–Chisesi Prize\nACM Prize in Computing']}
{'name': 'Rediet Abebe', 'education': ['University of California, BerkeleyHarvard UniversityCornell UniversityUniversity of Cambridge'], 'awards': ['Andrew Carnegie Fellow (2022)Harvard Society of Fellows (2019)MIT Technology Review Innovators Under 35 (2019)']}
....
That said, there are better options for crawling a page like this, such as Scrapy. If you go that route, you could also run your spiders in the cloud using a service like estela.
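A note on the run-together entries in the output above (e.g. 'University of California, BerkeleyHarvard University'): the infobox value cells separate entries with <br> tags or nested lists, so extracting with get_text and an explicit separator keeps them apart. A minimal sketch, assuming the standard infobox layout of th label / td value rows (infobox_field is a hypothetical helper, not part of the code above):
import requests
from bs4 import BeautifulSoup

def infobox_field(soup, label):
    # Find the infobox header cell (th) whose text contains the label,
    # e.g. "Institutions" or "Awards".
    th = soup.find(lambda tag: tag.name == "th" and label in tag.get_text())
    if th is None:
        return []
    td = th.find_next_sibling("td")
    if td is None:
        return []
    # get_text("\n") inserts the separator at <br> and element boundaries,
    # so adjacent entries no longer run together.
    return [part.strip() for part in td.get_text("\n").split("\n") if part.strip()]

page = requests.get("https://en.wikipedia.org/wiki/Rediet_Abebe")
soup = BeautifulSoup(page.content, "html.parser")
print(infobox_field(soup, "Institutions"))
print(infobox_field(soup, "Awards"))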

Related

Web scraping with BS4

I have a problem with scraping some basic info about movies from imdb.com. I want my program to get the title and description of a movie from a given URL. The title part is doing its job; however, I can't figure out how to get the description. Here's my code:
import requests
from bs4 import BeautifulSoup as bs

def get_data(url):
    r = requests.get(url, headers={'Accept-Language': 'en-US,en;q=0.5'})
    if not r or 'https://www.imdb.com/title' not in url:
        return print('Invalid movie page!')
    return r.content

if __name__ == '__main__':
    # print('Input the URL:')
    # link = input()
    link = 'https://www.imdb.com/title/tt0111161'
    data = get_data(link)
    soup = bs(data, 'html.parser')
    title = ' '.join(soup.find('h1').text.split()[:-1])
    desc = soup.find('p', {'data-testid':"plot", 'class':"GenresAndPlot__Plot-cum89p-8 kmrpno"}).text
    movie_info = {'title': title, 'description': desc}
    print(movie_info)
When I run it I get an error:
Exception has occurred: AttributeError
'NoneType' object has no attribute 'text'
File "movie-scraper.py", line 18, in <module>
desc = soup.find('p', {'data-testid':"plot", 'class':"GenresAndPlot__Plot-cum89p-8 kmrpno"}).text
How do I access the description properly?
To get the plot summary, change the selector to find class="plot_summary":
import requests
from bs4 import BeautifulSoup as bs

def get_data(url):
    r = requests.get(url, headers={"Accept-Language": "en-US,en;q=0.5"})
    if not r or "https://www.imdb.com/title" not in url:
        return print("Invalid movie page!")
    return r.content

if __name__ == "__main__":
    link = "https://www.imdb.com/title/tt0111161"
    data = get_data(link)
    soup = bs(data, "html.parser")
    title = " ".join(soup.find("h1").text.split()[:-1])
    desc = soup.find("div", class_="plot_summary").get_text(strip=True)  # <-- change this to find class="plot_summary"
    movie_info = {"title": title, "description": desc}
    print(movie_info)
Prints:
{'title': 'The Shawshank Redemption', 'description': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.Director:Frank DarabontWriters:Stephen King(short story "Rita Hayworth and Shawshank Redemption"),Frank Darabont(screenplay)Stars:Tim Robbins,Morgan Freeman,Bob Gunton|See full cast & crew»'}
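One caveat worth adding: generated class names like GenresAndPlot__Plot-cum89p-8 kmrpno change whenever IMDb rebuilds its frontend, so it's safer to check that find() actually matched before touching .text. A minimal sketch of that guard:
import requests
from bs4 import BeautifulSoup as bs

link = "https://www.imdb.com/title/tt0111161"
r = requests.get(link, headers={"Accept-Language": "en-US,en;q=0.5"})
soup = bs(r.content, "html.parser")

# find() returns None when the selector no longer matches,
# so guard before dereferencing .text / get_text().
plot_tag = soup.find("div", class_="plot_summary")
if plot_tag is not None:
    print(plot_tag.get_text(strip=True))
else:
    print("Plot summary not found; the page layout may have changed.")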

Location of text inside a div tag is changing

Here is a link to the website: https://www.ohorse.com/stables/
I want to extract the address from every listing div (circled in a screenshot in the original post).
I wrote the following code:
from requests import get
from bs4 import BeautifulSoup
from bs4 import NavigableString, Tag

url = 'https://www.ohorse.com/stables/'
resp = get(url)
soup = BeautifulSoup(resp.text, 'lxml')
all_divs = soup.findAll('div', class_='contentright')
for div in all_divs:
    # print(div.find('a', class_='listing').get('href'))
    sub_divs = div.findAll('div', class_='listing_content')
    for s_div in sub_divs:
        add = list(s_div.children)[0]
        add2 = list(s_div.children)[2]
        print(add)
        print(add2)
The output (shown as a screenshot in the original post) had two problems. On the very first line I got an image tag, because the first div contains a Facebook link instead of an address, so no address is returned for it. Second, the output contains some tags mixed in with the text; how can I check whether a list item is a tag so I can skip it?
I just want a solution so I can extract the address of every listing in a standard form.
import requests
from bs4 import BeautifulSoup

def main(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    target = soup.findAll('div', class_='listing_content')
    for tar in target:
        # .strings yields only the text nodes, skipping Tag children entirely
        tar = list(tar.strings)[:4]
        # dict.fromkeys removes duplicates while preserving order
        print(list(dict.fromkeys(tar)))

main("https://www.ohorse.com/stables/")
Output:
["Visit Houghton College Equestrian Center's Facebook Page", '9823 School Farm Rd', 'Houghton, NY 14744', '(585) 567-8142']
['HC80 Box 16', 'Burwell, NE 68823', '(308) 346-5530']
['1500 Kings Gap Road', 'Pine Mountain, GA 31811', '(229) 886-1709']
['6280 Taylor Ranch Loop', 'Kaufman, TX 75142', '(972) 467-4053']
['28424 Hegar Rd', 'Hockley, TX 77447', '(281) 702-2048']
['28424 Hegar Rd', 'Hockley, TX 77447', '(936) 931-1188']
['1409 US Hwy 59', 'Garvin, MN 56132', '(507) 629-4401']
['1911 De La Vina St', 'Santa Barbara, CA 93101', '(805) 448-4896']
['Shawnee, KS 66216', '(913) 963-8212', 'brian#3sevensranch.com']
['11127 Orcas Ave', 'Lake View Terrace, CA 91342', '(818) 899-9221']
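To answer the "how do I skip tags" part of the question directly: every child of a tag is either a Tag or a NavigableString, so an isinstance check filters out the image/anchor children. A minimal sketch reusing the imports from the question:
import requests
from bs4 import BeautifulSoup, NavigableString

resp = requests.get('https://www.ohorse.com/stables/')
soup = BeautifulSoup(resp.text, 'html.parser')

for s_div in soup.find_all('div', class_='listing_content'):
    # Keep only the plain-text children; Tag children (such as the
    # Facebook <a><img></a> block) fail the isinstance check and are skipped.
    text_parts = [c.strip() for c in s_div.children
                  if isinstance(c, NavigableString) and c.strip()]
    print(text_parts)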

I want to scrape URLs of all titles using Python

I wrote code to get all the title URLs, but I have some issues: it displays None values. Could you please help me out?
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')  # 1. html, 2. parser
        return soup

def get_index_data(soup):
    try:
        titles_link = soup.find_all('div', class_="marginTopTextAdjuster")
    except:
        titles_link = []
    urls = [item.get('href') for item in titles_link]
    print(urls)

def main():
    #url = "http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2653/rec/1"
    mainurl = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
    #get_page(url)
    get_index_data(get_page(mainurl))
    #write_csv(data,url)

if __name__ == '__main__':
    main()
You are trying to get the href attribute of the div tags, but divs don't have one. Instead, select the a tags; they seem to share a common class attribute, body_link_11.
Use titles_link = soup.find_all('a', class_="body_link_11") instead of titles_link = soup.find_all('div', class_="marginTopTextAdjuster").
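Applied to the get_index_data function from the question, the fix would look something like this (a sketch; body_link_11 is the class observed on that page):
def get_index_data(soup):
    # Select the <a> tags directly; the wrapper divs have no href
    # attribute, which is why the list printed None values.
    titles_link = soup.find_all('a', class_="body_link_11")
    urls = [item.get('href') for item in titles_link]
    print(urls)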
import requests
from bs4 import BeautifulSoup

url = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1"
req = requests.get(url)
soup = BeautifulSoup(req.text, "lxml")
titles_link = []
titles_div = soup.find_all('div', attrs={'class': 'marginTopTextAdjuster'})
for link in titles_div:
    tag = link.find_all('a', href=True)
    try:
        # keep only anchors that carry an item_id attribute (the title links)
        if tag[0].attrs.get('item_id', None):
            titles_link.append({tag[0].text: tag[0].attrs.get('href', None)})
    except IndexError:
        continue
print(titles_link)
Output:
[{'Civil Affairs Handbook, Japan, section 1a: population statistics.': '/cdm/singleitem/collection/p4013coll8/id/2653/rec/1'}, {'Army Air Forces Program 1943.': '/cdm/singleitem/collection/p4013coll8/id/2385/rec/2'}, {'Casualty report number II.': '/cdm/singleitem/collection/p4013coll8/id/3309/rec/3'}, {'Light armored division, proposed March 1943.': '/cdm/singleitem/collection/p4013coll8/id/2425/rec/4'}, {'Tentative troop list by type units for Blacklist operations.': '/cdm/singleitem/collection/p4013coll8/id/150/rec/5'}, {'Chemical Warfare Service: history of training, part 2, schooling of commissioned officers.': '/cdm/compoundobject/collection/p4013coll8/id/2501/rec/6'}, {'Horses in the German Army (1941-1945).': '/cdm/compoundobject/collection/p4013coll8/id/2495/rec/7'}, {'Unit history: 38 (MECZ) cavalry rcn. sq.': '/cdm/singleitem/collection/p4013coll8/id/3672/rec/8'}, {'Operations in France: December 1944, 714th Tank Battalion.': '/cdm/singleitem/collection/p4013coll8/id/3407/rec/9'}, {'G-3 Reports : Third Infantry Division. (22 Jan- 30 Mar 44)': '/cdm/singleitem/collection/p4013coll8/id/4393/rec/10'}, {'Summary of operations, 1 July thru 31 July 1944.': '/cdm/singleitem/collection/p4013coll8/id/3445/rec/11'}, {'After action report 36th Armored Infantry Regiment, 3rd Armored Division, Nov 1944 thru April 1945.': '/cdm/singleitem/collection/p4013coll8/id/3668/rec/12'}, {'Unit history, 38th Mechanized Cavalry Reconnaissance Squadron, 9604 thru 9665.': '/cdm/singleitem/collection/p4013coll8/id/3703/rec/13'}, {'Redeployment: occupation forces in Europe series, 1945-1946.': '/cdm/singleitem/collection/p4013coll8/id/2952/rec/14'}, {'Twelfth US Army group directives. Annex no. 1.': '/cdm/singleitem/collection/p4013coll8/id/2898/rec/15'}, {'After action report, 749th Tank Battalion: Jan, Feb, Apr - 8 May 45.': '/cdm/singleitem/collection/p4013coll8/id/3502/rec/16'}, {'743rd Tank Battalion, S3 journal history.': '/cdm/singleitem/collection/p4013coll8/id/3553/rec/17'}, {'History of military training, WAAC / WAC training.': '/cdm/singleitem/collection/p4013coll8/id/4052/rec/18'}, {'After action report, 756th Tank Battalion.': '/cdm/singleitem/collection/p4013coll8/id/3440/rec/19'}, {'After action report 92nd Cavalry Recon Squadron Mechanized 12th Armored Division, Jan thru May 45.': '/cdm/singleitem/collection/p4013coll8/id/3583/rec/20'}]
An easy way to do it with requests and BeautifulSoup:
import requests
from bs4 import BeautifulSoup

req = requests.get(url)  # url is the address of the page you want to fetch
soup = BeautifulSoup(req.text, "html.parser")  # req.text is the complete HTML of the page
print(soup.title.string)  # soup.title is the <title> tag; .string extracts its text without the tags
Try this.
from simplified_scrapy import SimplifiedDoc, req, utils

url = 'http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/1'
html = req.get(url)
doc = SimplifiedDoc(html)
lst = doc.selects('div.marginTopTextAdjuster').select('a')
titles_link = [(utils.absoluteUrl(url, a.href), a.text) for a in lst if a]
print(titles_link)
Result:
[('http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2653/rec/1', 'Civil Affairs Handbook, Japan, section 1a: population statistics.'), ('http://cgsc.cdmhost.com/cdm/landingpage/collection/p4013coll8', 'World War II Operational Documents'), ('http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2385/rec/2', 'Army Air Forces Program 1943.'),...

Scraping a Wikipedia table using "tr" and "td" with BeautifulSoup and Python

Total Python 3 beginner here. I can't seem to get just the names of the colleges to print out.
The class is nowhere near the college names, and I can't seem to narrow the find_all down to what I need and print it to a new CSV file. Any ideas?
import requests
from bs4 import BeautifulSoup
import csv

res = requests.get("https://en.wikipedia.org/wiki/Ivy_League")
soup = BeautifulSoup(res.text, "html.parser")
colleges = soup.find_all("table", class_="wikitable sortable")
for college in colleges:
    first_level = college.find_all("tr")
    print(first_level)
You can use soup.select() to utilize CSS selectors and be more precise:
import requests
from bs4 import BeautifulSoup

res = requests.get("https://en.wikipedia.org/wiki/Ivy_League")
soup = BeautifulSoup(res.text, "html.parser")
l = soup.select(".mw-parser-output > table:nth-of-type(2) > tbody > tr > td:nth-of-type(1) a")
for each in l:
    print(each.text)
Printed result:
Brown University
Columbia University
Cornell University
Dartmouth College
Harvard University
University of Pennsylvania
Princeton University
Yale University
To put a single column into a CSV:
import pandas as pd
pd.DataFrame([e.text for e in l]).to_csv("your_csv.csv") # This will include index
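If you don't want that index column, passing columns= to the DataFrame constructor names the header, and index=False in to_csv drops the row numbers:
import pandas as pd

# columns= supplies the header row; index=False drops the row-number column
pd.DataFrame([e.text for e in l], columns=["college"]).to_csv("your_csv.csv", index=False)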
With:
colleges = soup.find_all("table", class_="wikitable sortable")
you are getting all the tables with this class (there are five), not all the colleges in the table. So you can do something like this:
import requests
from bs4 import BeautifulSoup

res = requests.get("https://en.wikipedia.org/wiki/Ivy_League")
soup = BeautifulSoup(res.text, "html.parser")
college_table = soup.find("table", class_="wikitable sortable")
colleges = college_table.find_all("tr")
for college in colleges:
    college_link = college.find('a')
    if college_link is not None:
        college_name = college_link.text
        print(college_name)
EDIT: I added an if to discard the first row, which contains the table header.
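Since the question also asked about printing to a new CSV file (and already imports csv), here is a minimal sketch of the same loop writing with the standard-library csv module:
import csv
import requests
from bs4 import BeautifulSoup

res = requests.get("https://en.wikipedia.org/wiki/Ivy_League")
soup = BeautifulSoup(res.text, "html.parser")
college_table = soup.find("table", class_="wikitable sortable")

with open("colleges.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(["college"])  # header row
    for row in college_table.find_all("tr"):
        link = row.find("a")
        if link is not None:  # skips the table's header row, which has no link
            writer.writerow([link.text])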

Review scraping from TripAdvisor

I am new to web scraping in Python 3. I want to scrape the reviews of all the hotels in Dubai, but the problem is that I can only scrape the reviews of the hotel whose URL I give in the script. Can anyone show me how I can get all of the hotel reviews without explicitly giving the URL of each hotel?
import requests
from bs4 import BeautifulSoup

importurl = 'https://www.tripadvisor.com/Hotel_Review-g295424-d302778-Reviews-Roda_Al_Bustan_Dubai_Airport-Dubai_Emirate_of_Dubai.html'
r = requests.get(importurl)
soup = BeautifulSoup(r.content, "lxml")
resultsoup = soup.find_all("p", {"class": "partial_entry"})

# save the reviews to a test text file locally
for review in resultsoup:
    review_list = review.get_text()
    print(review_list)

with open('testreview.txt', 'w') as fid:
    for review in resultsoup:
        review_list = review.get_text()
        fid.write(review_list)
You should find the index pages that list all the hotels, collect every hotel link into a list, then loop over that URL list to get the reviews.
import bs4, requests

index_pages = ('http://www.tripadvisor.cn/Hotels-g295424-oa{}-Dubai_Emirate_of_Dubai-Hotels.html#ACCOM_OVERVIEW'.format(i) for i in range(0, 540, 30))

urls = []
with requests.session() as s:
    for index in index_pages:
        r = s.get(index)
        soup = bs4.BeautifulSoup(r.text, 'lxml')
        url_list = [i.get('href') for i in soup.select('.property_title')]
        urls.extend(url_list)  # extend, not append, so urls is a flat list of links
Output:
len(urls): 540
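The second stage would then loop over the collected links and apply the review selector from the question. A hedged sketch (assuming the .property_title hrefs may be site-relative, which the startswith check covers either way):
import bs4, requests

BASE = 'http://www.tripadvisor.cn'
with requests.session() as s:
    for href in urls:  # urls collected by the index-page loop above
        # Prepend the host when the listing emits a site-relative link.
        full_url = href if href.startswith('http') else BASE + href
        r = s.get(full_url)
        soup = bs4.BeautifulSoup(r.text, 'lxml')
        for review in soup.find_all('p', {'class': 'partial_entry'}):
            print(review.get_text())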
