I am trying to make a web scraper using Python, and the basic concept I am using here is:
create an empty list --> use a 'for' loop to go through the elements on the web page --> append that info to the empty list --> convert the list to rows and columns using pandas --> finally write it to a CSV.
The code that I made is:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
headers = {"Accept-Language": "en-US, en;q=0.5"}
url = "https://www.imdb.com/find?q=top+1000+movies&ref_=nv_sr_sm"
results=requests.get(url,headers=headers)
soup=BeautifulSoup(results.text,"html.parser")
# print(soup.prettify())
#initializing empty lists where the data will go
titles =[]
years = []
times = []
imdb_rating = []
metascores = []
votes = []
us_gross = []
movie_div = soup.find_all('div', class_='lister-list')
#initiating the loop for the scraper
for container in movie_div:
    #titles
    name = container.tr.td.a.text
    titles.append(name)

print(titles)
The website I want to scrape is 'https://www.imdb.com/chart/top/?ref_=nv_mv_250'. I need help figuring out the correct path for the variable 'name', so that I can extract the movie names from the page's HTML, because each time I am getting an empty list as output.
This example parses the name, year, and rating from the table and creates a DataFrame from it:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = "https://www.imdb.com/chart/top/"
headers = {"Accept-Language": "en-US, en;q=0.5"}
soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")
all_data = []
for row in soup.select(".lister-list > tr"):
    name = row.select_one(".titleColumn a").text.strip()
    year = row.select_one(".titleColumn .secondaryInfo").text.strip()
    rating = row.select_one(".imdbRating").text.strip()
    # ...other variables
    all_data.append([name, year, rating])
df = pd.DataFrame(all_data, columns=["Name", "Year", "Rating"])
print(df.head().to_markdown(index=False))
Prints:
| Name                     | Year   | Rating |
|:-------------------------|:-------|-------:|
| The Shawshank Redemption | (1994) |    9.2 |
| The Godfather            | (1972) |    9.2 |
| The Dark Knight          | (2008) |      9 |
| The Godfather: Part II   | (1974) |      9 |
| 12 Angry Men             | (1957) |    8.9 |
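Since the original goal was a CSV, the last step is just a to_csv call on the DataFrame built above (the filename here is arbitrary):

# write the scraped table to disk; index=False drops the pandas row numbers
df.to_csv("imdb_top_250.csv", index=False)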
I wrote this code to extract multiple pages of data from this site (base URL - "https://www.goodreads.com/shelf/show/fiction").
import requests
from bs4 import BeautifulSoup
import pandas as pd
page = 1
book_title = []
while page != 5:
    url = 'https://www.goodreads.com/shelf/show/fiction?page={page}'
    response = requests.get(url)
    page_content = response.text
    doc = BeautifulSoup(page_content, 'html.parser')
    a_tags = doc.find_all('a', {'class': 'bookTitle'})
    for tag in a_tags:
        book_title.append(tag.text)
    page = page + 1
But it's only showing the first 50 books' data. How can I extract the names of all the fiction books across all pages using BeautifulSoup?
You can paginate the fiction category from your base URL by going through the search instead: enter the "fiction" keyword in the search box and click the search button, which gives this URL: https://www.goodreads.com/search?q=fiction&qid=ydDLZMCwDJ. From there you can collect the data and build the next pages.
import requests
from bs4 import BeautifulSoup
import pandas as pd
book_title = []
url = 'https://www.goodreads.com/search?page={page}&q=fiction&qid=ydDLZMCwDJ&tab=books'
for page in range(1, 11):
    response = requests.get(url.format(page=page))
    page_content = response.text
    doc = BeautifulSoup(page_content, 'html.parser')
    a_tags = doc.find_all('a', {'class': 'bookTitle'})
    for tag in a_tags:
        book_title.append(tag.get_text(strip=True))

df = pd.DataFrame(book_title, columns=['Title'])
print(df)
Output:
Title
0 Trigger Warning: Short Fictions and Disturbances
1 You Are Not So Smart: Why You Have Too Many Fr...
2 Smoke and Mirrors: Short Fiction and Illusions
3 Fragile Things: Short Fictions and Wonders
4 Collected Fictions
.. ...
195 The Science Fiction Hall of Fame, Volume One, ...
196 The Art of Fiction: Notes on Craft for Young W...
197 Invisible Planets: Contemporary Chinese Scienc...
198 How Fiction Works
199 Monster, She Wrote: The Women Who Pioneered Ho...
[200 rows x 1 columns]
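As a side note, the loop in the question only ever fetched the same page because the URL was a plain string literal, so {page} was never substituted (the site most likely treats the invalid page parameter as page 1). Either str.format, as used above, or an f-string fixes that:

url = 'https://www.goodreads.com/shelf/show/fiction?page={page}'
response = requests.get(url.format(page=page))
# or, equivalently, with an f-string:
# response = requests.get(f'https://www.goodreads.com/shelf/show/fiction?page={page}')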
I am scraping a table from https://csr.gov.in/companyprofile.php?year=FY+2015-16&CIN=L00000CH1990PLC010573, but I am not getting the exact result I am looking for. I want 11 columns from this link: "company name", "Class", "State", "Company Type", "RoC", "Sub Category", "Listing Status". Those are 7 columns, and after that there is an expand button "CSR Details of FY 2017-18"; when you click that button you get 4 more columns: "Average Net Profit", "CSR Prescribed Expenditure", "CSR Spent", "Local Area Spent". I want all of these columns in a CSV file. I wrote some code but it is not working properly. I am attaching an image of the result for reference. Here is my code; please help me get this data.
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import csv
driver = webdriver.Chrome()
url_file = "csrdata.txt"
with open(url_file, "r") as url:
    url_pages = url.read()

# we need to split the urls into a list to make them iterable
pages = url_pages.split("\n")  # split by lines using \n

data = []
# now we run a for loop to visit the urls one by one
for single_page in pages:
    driver.get(single_page)
    r = requests.get(single_page)
    soup = BeautifulSoup(r.content, 'html5lib')
    driver.find_element_by_link_text("CSR Details of FY 2017-18").click()
    table = driver.find_elements_by_xpath("//*[contains(@id,'colfy4')]")
    about = table.__getitem__(0).text
    x = about.split('\n')
    print(x)
    data.append(x)
df = pd.DataFrame(data)
print(df)
# write to csv
df.to_csv('csr.csv')
You don't need to use Selenium since all the information is inside the HTML. You can also use pandas' built-in function pd.read_html() to transform the HTML table directly into a DataFrame.
data = []
for single_page in pages:
    r = requests.get(single_page)
    soup = BeautifulSoup(r.content, 'html5lib')
    table = soup.find_all('table')  # finds all tables
    table_top = pd.read_html(str(table))[0]  # the top table
    try:  # try to get the other table if it exists
        table_extra = pd.read_html(str(table))[7]
    except:
        table_extra = pd.DataFrame()
    result = pd.concat([table_top, table_extra])
    data.append(result)

pd.concat(data).to_csv('test.csv')
output:
0 1
0 Class Public
1 State Chandigarh
2 Company Type Other than Govt.
3 RoC RoC-Chandigarh
4 Sub Category Company limited by shares
5 Listing Status Listed
0 Average Net Profit 0
1 CSR Prescribed Expenditure 0
2 CSR Spent 0
3 Local Area Spent 0
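If you would rather have one row per company with the field names as the 11 requested columns, one possible tweak (a sketch, assuming the label/value pairs are unique on each page) is to index on the label column and transpose before appending:

result = pd.concat([table_top, table_extra])
wide = result.set_index(0).T   # labels become column headers, values become a single row
wide.columns.name = None
data.append(wide)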
I am trying to get (some of) the contents of a table from a URL.
So far I have managed to get two desired contents of the page, but there is a third one (third column) whose text I would also like to get. The problem is that the underlying link exists elsewhere on the page (with different text), and if I want to load the table into an SQL database, the contents of the third column won't match the first two columns.
import urllib2
from bs4 import BeautifulSoup
startURL = "http://some.url/website.html"
page = urllib2.urlopen(startURL).read()
soup = BeautifulSoup(page, "html.parser")
for links in soup.findAll("a"):
    if "href" in links.attrs:
        www = links.attrs.values()
        if not "https://" in www[0]:  # to exclude all non-relative links, e.g. external links
            if "view/" in www[0]:  # to get only my desired links of column 1
                link_of_column1 = www[0]  # this is now my wanted link
Okay, so with this code I can get the second column. Where and how would I have to apply .next_sibling to get the link in the next (3rd) column?
Edit:
As I have been asked: The URL is https://myip.ms/browse/web_hosting/World_Web_Hosting_Global_Statistics.html and I want to get the contents from Column 2 and 3, which is "Hosting Company" (link-text and link) and "Country" (only the text).
Edit2:
Another thing I forgot: how can I extract the information that says it's 137,157 records?
First find the table that has all the info using its id="web_hosting_tbl" attribute. Then iterate over all the rows of the table. But if you look at the page source, the rows you need are not consecutive but alternate, and they don't have any class names. Also, the first row of the table is the header row, so we have to skip that.
After getting the required rows (using table.find_all('tr')[1::2]), find all the columns and then get the required information from the corresponding columns.
Code:
import requests
from bs4 import BeautifulSoup
r = requests.get('https://myip.ms/browse/web_hosting/World_Web_Hosting_Global_Statistics.html')
soup = BeautifulSoup(r.text, 'lxml')
table = soup.find('table', id='web_hosting_tbl')
for row in table.find_all('tr')[1::2]:
    all_columns = row.find_all('td')
    name = all_columns[1].a.text
    link = all_columns[1].a['href']
    country = all_columns[2].a.text
    print(name, link, country, sep=' | ')
Partial output:
Godaddy.com, LLC | /view/web_hosting/2433/Godaddy_com_LLC.html | USA
Cloudflare, Inc | /view/web_hosting/4638/Cloudflare_Inc.html | USA
Amazon.com, Inc | /view/web_hosting/615/Amazon_com_Inc.html | USA
Ovh Sas | /view/web_hosting/7593/Ovh_Sas.html | France
Hetzner Online Ag | /view/web_hosting/45081/Hetzner_Online_Ag.html | Germany
Hostgator.com Llc | /view/web_hosting/26757/Hostgator_com_Llc.html | USA
Google Inc | /view/web_hosting/617/Google_Inc.html | USA
Bluehost Inc | /view/web_hosting/3886/Bluehost_Inc.html | USA
...
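Since the end goal mentioned in the question is an SQL database, the three values per row can also go straight into SQLite with the standard library; a rough sketch reusing the table variable from the code above (the database and table names here are placeholders):

import sqlite3

conn = sqlite3.connect('hosting.db')
conn.execute('CREATE TABLE IF NOT EXISTS hosting (name TEXT, link TEXT, country TEXT)')
for row in table.find_all('tr')[1::2]:
    cols = row.find_all('td')
    conn.execute('INSERT INTO hosting VALUES (?, ?, ?)',
                 (cols[1].a.text, cols[1].a['href'], cols[2].a.text))
conn.commit()
conn.close()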
Code: (Python 3.6+, uses f-strings)
import urllib.parse
from collections import namedtuple
from datetime import datetime
import bs4
import requests
HostingCompany = namedtuple('HostingCompany',
                            ('name', 'country', 'websites', 'usage', 'usage_by_top', 'update_time'))


class MyIpLink:
    url_base = 'https://myip.ms'

    def __init__(self, tag: bs4.element.Tag, *, is_anchor=False):
        a_tag = tag.find('a')
        if is_anchor:  # treat `tag` as an anchor tag
            a_tag = tag
        self.text = tag.text.strip()
        self.url = urllib.parse.urljoin(self.url_base, a_tag['href'])

    def __repr__(self):
        return f'{self.__class__.__name__}(text={repr(self.text)}, url={repr(self.url)})'
url = 'https://myip.ms/browse/web_hosting/World_Web_Hosting_Global_Statistics.html'
html = requests.get(url).text
soup = bs4.BeautifulSoup(html, 'html.parser')
rows = soup.select('#web_hosting_tbl > tbody > tr')[::2] # skips "more info" rows
companies = []
for row in rows:
    tds = row.find_all('td')
    name = MyIpLink(tds[1])
    country = MyIpLink(tds[2])
    websites = [MyIpLink(a, is_anchor=True) for a in tds[3].find_all('a')]
    usage = MyIpLink(tds[4])
    usage_by_top = MyIpLink(tds[5])
    update_time = datetime.strptime(tds[6].text.strip(), '%d %b %Y, %H:%M')
    company = HostingCompany(name, country, websites, usage, usage_by_top, update_time)
    companies.append(company)
import pprint
pprint.pprint(companies)
print(companies[0].name.text)
print(companies[0].name.url)
print(companies[0].country.text)
Output:
[HostingCompany(name=MyIpLink(text='Godaddy.com, LLC', url='https://myip.ms/view/web_hosting/2433/Godaddy_com_LLC.html'), country=MyIpLink(text='USA', url='https://myip.ms/view/best_hosting/USA/Best_Hosting_in_USA.html'), websites=[MyIpLink(text='www.godaddy.com', url='https://myip.ms/go.php?1229687315_ITg7Im93dCkWE0kNAhQSEh0FUeHq5Q==')], usage=MyIpLink(text='512,701 sites', url='https://myip.ms/browse/sites/1/ownerID/2433/ownerIDii/2433'), usage_by_top=MyIpLink(text='951 sites', url='https://myip.ms/browse/sites/1/rankii/100000/ownerID/2433/ownerIDii/2433'), update_time=datetime.datetime(2018, 5, 2, 5, 17)),
HostingCompany(name=MyIpLink(text='Cloudflare, Inc', url='https://myip.ms/view/web_hosting/4638/Cloudflare_Inc.html'), country=MyIpLink(text='USA', url='https://myip.ms/view/best_hosting/USA/Best_Hosting_in_USA.html'), websites=[MyIpLink(text='www.cloudflare.com', url='https://myip.ms/go.php?840626136_OiEsK2ROSxAdGl4QGhYJG+Tp6fnrv/f49w==')], usage=MyIpLink(text='488,119 sites', url='https://myip.ms/browse/sites/1/ownerID/4638/ownerIDii/4638'), usage_by_top=MyIpLink(text='16,160 sites', url='https://myip.ms/browse/sites/1/rankii/100000/ownerID/4638/ownerIDii/4638'), update_time=datetime.datetime(2018, 5, 2, 5, 10)),
HostingCompany(name=MyIpLink(text='Amazon.com, Inc', url='https://myip.ms/view/web_hosting/615/Amazon_com_Inc.html'), country=MyIpLink(text='USA', url='https://myip.ms/view/best_hosting/USA/Best_Hosting_in_USA.html'), websites=[MyIpLink(text='www.amazonaws.com', url='https://myip.ms/go.php?990446041_JyYhKGFxThMQHUMRHhcDExHj8vul7f75')], usage=MyIpLink(text='453,230 sites', url='https://myip.ms/browse/sites/1/ownerID/615/ownerIDii/615'), usage_by_top=MyIpLink(text='9,557 sites', url='https://myip.ms/browse/sites/1/rankii/100000/ownerID/615/ownerIDii/615'), update_time=datetime.datetime(2018, 5, 2, 5, 4)),
...
]
Godaddy.com, LLC
https://myip.ms/view/web_hosting/2433/Godaddy_com_LLC.html
USA
Gonna update the answer with some explanation in the evening. Cheers!
Try the approach below. It should give you the text from column 2, the links from column 2, and the text from column 3 of that table. I used lxml instead of BeautifulSoup to make it faster. Thanks.
import requests
from urllib.parse import urljoin
from lxml.html import fromstring
URL = 'https://myip.ms/browse/web_hosting/World_Web_Hosting_Global_Statistics.html'
res = requests.get(URL)
root = fromstring(res.text)
for items in root.cssselect('#web_hosting_tbl tr:not(.expand-child)')[1:]:
    name = items.cssselect("td.row_name a")[0].text
    link = urljoin(URL, items.cssselect("td.row_name a")[0].attrib['href'])
    country = items.cssselect("td a[href^='/view/best_hosting/']")[0].text
    print(name, link, country)
Results:
Godaddy.com, LLC https://myip.ms/view/web_hosting/2433/Godaddy_com_LLC.html USA
Cloudflare, Inc https://myip.ms/view/web_hosting/4638/Cloudflare_Inc.html USA
Amazon.com, Inc https://myip.ms/view/web_hosting/615/Amazon_com_Inc.html USA
Ovh Sas https://myip.ms/view/web_hosting/7593/Ovh_Sas.html France
Hetzner Online Ag https://myip.ms/view/web_hosting/45081/Hetzner_Online_Ag.html Germany
Hostgator.com Llc https://myip.ms/view/web_hosting/26757/Hostgator_com_Llc.html USA
Google Inc https://myip.ms/view/web_hosting/617/Google_Inc.html USA
I am currently trying to pull data from TripAdvisor restaurant pages for different countries. The fields I am trying to pull are name, address, and cuisine type (Chinese, steakhouse, etc.). I have successfully been able to pull name and address using my script; however, pulling the cuisine type is proving pretty difficult. Below you'll find screenshots of what I am trying to pull from TripAdvisor, and my code.
What I want to pull from TripAdvisor is circled in red.
When I print the results, my code keeps printing 'Asian' even though the second restaurant should be a 'Steakhouse'.
#import libraries
import requests
from bs4 import BeautifulSoup
import csv
#loop to move into the next pages. entries are in increments of 30 per page
for i in range(0, 120, 30):
#need this here for when you want more than 30 entries pulled
while i <= range:
i = str(i)
#url format offsets the restaurants in increments of 30 after the oa
url1 = 'https://www.tripadvisor.com/Restaurants-g294217-oa' + i + '-Hong_Kong.html#EATERY_LIST_CONTENTS'
r1 = requests.get(url1)
data1 = r1.text
soup1 = BeautifulSoup(data1, "html.parser")
for link in soup1.findAll('a', {'property_title'}):
#print 'https://www.tripadvisor.com/Restaurant_Review-g294217-' + link.get('href')
restaurant_url = 'https://www.tripadvisor.com/Restaurant_Review-g294217-' + link.get('href')
#print link.string
account_name = link.string.strip()
#cuisine type pull
for link in soup1.findAll('a', {'cuisine'}):
cuisinetype = link.string.strip()
r_address = requests.get(restaurant_url)
r_addresstext = r_address.text
soup2 = BeautifulSoup(r_addresstext, "html.parser")
for restaurant_url in soup2.findAll('span', {'street-address'})[0]:
#print(restaurant_url.string)
rest_address = restaurant_url.string
rest_array = [account_name, rest_address, cuisinetype]
print rest_array
#with open('ListingsPull-HongKong.csv', 'a') as file:
#writer = csv.writer(file)
#writer.writerow([account_name, rest_address])
break
This approach is not especially elegant but might be acceptable to you. I notice that the information you want seems to be repeated under the 'Details' tab, under 'Cuisine'. I've found it easier to access it there, this way:
>>> import requests
>>> from bs4 import BeautifulSoup
>>> restaurant_url='https://www.tripadvisor.ca/Restaurant_Review-g294217-d2399904-Reviews-Tin_Lung_Heen-Hong_Kong.html'
>>> soup2 = BeautifulSoup(requests.get(restaurant_url).text, "html.parser")
>>> street_address=soup2.find('span',{'street-address'})
>>> street_address
<span class="street-address" property="streetAddress">International Commerce Centre, 1 Austin Road West, Kowloon</span>
>>> street_address.contents[0]
'International Commerce Centre, 1 Austin Road West, Kowloon'
>>> for item in soup2.findAll('div', attrs={'class', 'title'}):
...     if 'Cuisine' in item.text:
...         item.text.strip()
...         break
...
'Cuisine'
>>> content=item.findNext('div', attrs={'class', 'content'})
>>> content
<div class="content">
Chinese, Asian
</div>
>>> content.text
'\nChinese,\xa0Asian\n'
>>> content.text.strip().split('\xa0')
['Chinese,', 'Asian']
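If you want the cuisines as a clean list, splitting on the comma and stripping whitespace (str.strip also removes the \xa0 non-breaking space) works on the same content tag; continuing the session above:

>>> [c.strip() for c in content.text.split(',')]
['Chinese', 'Asian']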