How to scrape embedded integers on a website - python

I'm trying to scrape the number of likes for the datasets available on this website.
I've been unable to work out a way of reliably identifying and scraping the relationship between the dataset title and the like count:
as it is embedded in the HTML as below:
I have used a scraper previously to get information about the resource urls. In that case I was able to capture the last child a of parent h3 with a parent having class .dataset-item.
I would like to adapt my existing code to scrape the number of likes for each resource in the catalogue, rather than the URLs. Below is the code for the url scraper I used:
from bs4 import BeautifulSoup as bs
import requests
import csv
from urllib.parse import urlparse
json_api_links = []
data_sets = []
def get_links(s, url, css_selector):
    """Fetch ``url`` and return absolute URLs for the elements matching ``css_selector``.

    Args:
        s: Session-like object exposing ``get(url)`` (a ``requests.Session``
            in this script).
        url: Page to download; also used as the base when resolving hrefs.
        css_selector: CSS selector that picks the anchor elements to harvest.

    Returns:
        A list of URL strings, one per matched element that has a non-empty
        ``href`` attribute.
    """
    from urllib.parse import urljoin  # local import: the file only imports urlparse

    r = s.get(url)
    soup = bs(r.content, 'lxml')
    links = []
    for item in soup.select(css_selector):
        href = item.get('href')
        if not href:
            # The old ``item['href'][0]`` test raised on a missing/empty href.
            continue
        # urljoin handles root-relative ("/x"), protocol-relative ("//host/x"),
        # page-relative ("x") and already-absolute hrefs alike.
        links.append(urljoin(url, href))
    return links
results = []
with requests.Session() as s:
    # Step 1: walk the catalogue pages and, for every dataset page, collect
    # the links that point at the package_show JSON API.
    for page in range(1, 2):  # set number of pages
        links = get_links(
            s,
            'https://data.nsw.gov.au/data/dataset?page={}'.format(page),
            '.dataset-item h3 a:last-child',
        )
        for link in links:
            data = get_links(s, link, '[href*="/api/3/action/package_show?id="]')
            json_api_links.append(data)

    # Step 2: flatten the per-dataset lists and drop duplicates.
    resources = {item.replace('opendata', '') for sublist in json_api_links for item in sublist}

    # Step 3: pull title and resource URLs out of each package description.
    for link in resources:
        try:
            r = s.get(link).json()  # entire package info
            data_sets.append(r)
            title = r['result']['title']  # certain items
            if 'resources' in r['result']:
                urls = ' , '.join([item['url'] for item in r['result']['resources']])
            else:
                urls = 'N/A'
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # Network failure, non-JSON body or unexpected payload shape:
            # keep a placeholder row instead of aborting the whole run.
            title = 'N/A'
            urls = 'N/A'
        results.append((title, urls))

# utf-8 so that non-ASCII titles do not depend on the platform default encoding.
with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    w = csv.writer(f)
    w.writerow(['Title', 'Resource Url'])
    w.writerows(results)
My desired output would appear like this:

The approach is pretty straightforward. On the given website, each dataset sits in its own <li> tag. All you need to do is take each of those <li> elements and pull out the heading, which has a certain class; the same goes for the like count.
The catch with the like count is that its text contains some noise. To fix that, you can use a regular expression (r'\d+') to extract the digits from the like-count text. The following code gives the desired result:
from bs4 import BeautifulSoup as soup
import requests
import re
import pandas as pd
source = requests.get('https://data.nsw.gov.au/data/dataset')
sp = soup(source.text, 'lxml')
element = sp.find_all('li', {'class': "dataset-item"})
heading = []
likeList = []
for i in element:
    # Prefer the anchor carrying the analytics class; fall back to the first <a>.
    header = i.find('a', {'class': "searchpartnership-url-analytics"})
    if header is None:
        header = i.find('a')
    heading.append(header.text if header is not None else 'N/A')

    # The like text contains extra characters, so keep only the first run of digits.
    like = i.find('span', {'id': 'likes-count'})
    digits = re.search(r'\d+', like.text) if like is not None else None
    # Always append something so that both columns stay the same length.
    likeList.append(digits.group() if digits else 'N/A')

# Named ``data`` rather than ``dict`` so the builtin is not shadowed.
data = {'Title': heading, 'Likes': likeList}
# ``index=False`` is not a valid DataFrame constructor argument; the default
# RangeIndex is used instead.
df = pd.DataFrame(data)
print(df)
Hope it helped!

You could use the following.
I am using a css selector with Or syntax to retrieve title and likes as one list (as every publication has both). I then use slicing to separate titles from likes.
from bs4 import BeautifulSoup as bs
import requests
import csv
def get_titles_and_likes(s, url, css_selector):
    """Return ``(title, likes)`` pairs scraped from ``url``.

    ``css_selector`` is expected to match, for every dataset, a title element
    followed by its like-count element, so the stripped texts alternate
    title, likes, title, likes, ...
    """
    response = s.get(url)
    page = bs(response.content, 'lxml')
    texts = []
    for node in page.select(css_selector):
        texts.append(node.text.strip())
    # Even positions hold the titles, odd positions the matching like counts.
    return list(zip(texts[0::2], texts[1::2]))
results = []
with requests.Session() as s:
    for page in range(1, 10):  # set number of pages
        page_url = 'https://data.nsw.gov.au/data/dataset?page={}'.format(page)
        selector = '.dataset-heading .searchpartnership-url-analytics, .dataset-heading [href*="/data/dataset"], .dataset-item #likes-count'
        # Add this page's (title, likes) pairs straight onto the flat result list.
        results.extend(get_titles_and_likes(s, page_url, selector))

with open(r'data.csv', 'w', newline='') as f:
    w = csv.writer(f)
    w.writerow(['Title', 'Likes'])
    w.writerows(results)

Related

Scraping a website with multiple pages and not getting the desired amount of output with BeautifulSoup

I am trying to scrape a website with multiple pages (50) and get specific information, but when I run my code my output is just 7 items, even though there are over 20,000 on the website. I found out that my code is scraping just the first page. I don't know what else to do and I'd appreciate your help. Thank you.
import requests
from bs4 import BeautifulSoup
import pandas as pd
name_selector = ".name"
old_price_selector = ".old"
new_price_selector = ".prc"
# Request result pages 1-49 and collect name, prices and discount per listing.
for i in range(1,50,1):
url = "https://www.jumia.com.ng/phones-tablets/samsung/?q=samsung+phones&page=" +str(i)+ "#catalog-listing"
website = requests.get(url)
soup = BeautifulSoup(website.content, 'html.parser')
name = soup.select(name_selector)
old_price = soup.select(old_price_selector)
new_price = soup.select(new_price_selector)
discount = soup.findAll("div", {"class": "bdg _dsct _sm"})
# NOTE(review): data is re-created at this point; if this line sits inside the
# page loop, rows gathered from earlier pages are thrown away.
data = []
# zip() stops at the shortest of the four lists, so a listing that lacks any
# one of the fields shortens the output for the whole page.
for names, old_prices, new_prices, discounts in zip(name, old_price, new_price, discount):
# NOTE(review): "New Prices" is filled from old_prices and "Old Prices" from
# new_prices - the labels look swapped; confirm against the page.
dic = {"Phone Names": names.getText(),"New Prices": old_prices.getText(),"Old Prices": new_prices.getText(),"Discounts": discounts.getText()}
data.append(dic)
df = pd.DataFrame(data)
You have to create data = [] before first loop. That's all.
data = []
for i in range(1, 50):
# ... code ...
Your code creates new data = [] in every loop and it removes previous content - so you get data only from last page.

Attempting to retrieve text from <td></td> tags using BeautifulSoup

So I'm using BeautifulSoup to scrape the link in the code. The artist names and the links come out fine, but I'm not sure how to access the nationality in that second tag.
Here's the code:
import requests
import csv
from bs4 import BeautifulSoup
# Download the archived artist index and write one CSV row per artist link.
def findName():
page = requests.get('https://web.archive.org/web/20121007172955/https://www.nga.gov/collection/anB1.htm')
soup = BeautifulSoup(page.text, 'html.parser')
# Remove the AlphaNav element so its anchors are not picked up below.
last_links = soup.find(class_='AlphaNav')
last_links.decompose()
# NOTE(review): the file object handed to csv.writer is never closed explicitly.
f = csv.writer(open('h-artist_lastname.csv', 'w')) # Create a file to write
# NOTE(review): the header declares three columns, but each row written below
# contains only the name and the link.
f.writerow(['Last Name, First Name', 'Nationality', 'Link'])
artist_name_list = soup.find(class_='BodyText')
artist_name_list_items = artist_name_list.find_all('a')
# find_all returns a list of every <td> inside BodyText (names and nationalities).
artist_nationality_list_items = artist_name_list.find_all('td')
print(artist_nationality_list_items)
for artist_name in artist_name_list_items:
names = artist_name.contents[0]
# The line below fails when enabled: .contents is read from the list itself,
# not from one of its elements.
#nationalities = artist_nationality_list_items.contents[0]
links = 'https://web.archive.org' + artist_name.get('href')
#print(nationalities)
f.writerow([names, links])
findName()
If I uncomment the line in the for loop, I get a runtime error which I expect. The print statement gives me this value for artist_nationality_list_items:
<td>Babbitt, Platt D.</td>, <td>American, died 1879</td>, ..... <- follows this pattern for every artist
Basically, I want the part with 'American, died 1879'.
You can use select which accepts CSS selectors with :nth-child() to select second <td> in each <tr> instead of find_all, so this:
artist_nationality_list_items = artist_name_list.find_all('td')
becomes:
artist_nationality_list_items = artist_name_list.select('td:nth-child(2)')
You can still work with contents, but don't get bogged down with all the lists - Select your target more specific and get all information with more flow.
What happens?
You're treating artist_nationality_list_items (a list) like a single element; that won't work.
How to fix?
To get the right result from your artist_nationality_list_items you have to iterate it too.
(Works, but bad idea):
for i,artist_name in enumerate(artist_name_list_items):
names = artist_name.contents[0]
nationalities = artist_nationality_list_items[i+1].contents[0]
links = 'https://web.archive.org' + artist_name.get('href')
An alternative and much leaner approach:
import requests, csv
from bs4 import BeautifulSoup
def findName():
    """Scrape the archived artist index and write name, nationality and link to CSV.

    Writes ``h-artist_lastname.csv`` in the current directory: a header row
    followed by one row per table row that has both a name cell, a
    nationality cell and a link.
    """
    page = requests.get('https://web.archive.org/web/20121007172955/https://www.nga.gov/collection/anB1.htm')
    soup = BeautifulSoup(page.text, 'html.parser')

    # The with-block closes (and flushes) the file, which a bare
    # csv.writer(open(...)) never does; newline='' is what the csv module
    # expects for files it writes.
    with open('h-artist_lastname.csv', 'w', newline='') as out:
        f = csv.writer(out)
        f.writerow(['Last Name, First Name', 'Nationality', 'Link'])

        for row in soup.select('div.BodyText h3+table tr'):
            # Address the cells by tag instead of row.contents[i], which also
            # counts whitespace text nodes between the tags.
            cells = row.find_all('td')
            anchor = row.find('a', href=True)
            if len(cells) < 2 or anchor is None:
                continue  # not an artist row
            names = cells[0].get_text()
            nationalities = cells[1].get_text()
            links = 'https://web.archive.org' + anchor['href']
            f.writerow([names, nationalities, links])
findName()
A little bit of a botched answer with some sloppy workarounds, but this resulted in what I needed:
import requests
import csv
from bs4 import BeautifulSoup
def findName():
    """Write artist name, nationality and link from the archived index page to CSV.

    Artist links are paired with nationality cells by position: the n-th
    link (counting from 1) is matched with the ``td:nth-of-type(2n)`` cell
    inside the ``BodyText`` element. Output goes to ``b-artist_lastname.csv``.
    """
    page = requests.get('https://web.archive.org/web/20121007172955/https://www.nga.gov/collection/anB1.htm')
    soup = BeautifulSoup(page.text, 'html.parser')

    # Drop the AlphaNav element (when present) so its anchors are not
    # mistaken for artist links.
    last_links = soup.find(class_='AlphaNav')
    if last_links is not None:
        last_links.decompose()

    artist_name_list = soup.find(class_='BodyText')
    artist_name_list_items = artist_name_list.find_all('a')

    # The with-block guarantees the CSV is flushed and closed.
    with open('b-artist_lastname.csv', 'w', newline='') as out:
        f = csv.writer(out)
        f.writerow(['Last Name, First Name', 'Nationality', 'Link'])

        for position, artist_name in enumerate(artist_name_list_items, start=1):
            # Build the selector directly instead of patching a list of characters.
            selection = 'td:nth-of-type({})'.format(2 * position)
            cell = artist_name_list.select_one(selection)
            # Read the cell text instead of slicing the str() of a result list;
            # an unmatched selector yields an empty nationality.
            nationality = cell.get_text() if cell is not None else ''
            names = artist_name.contents[0]
            links = 'https://web.archive.org' + artist_name.get('href')
            f.writerow([names, nationality, links])
findName()
Thank you to everyone who answered. Using 'td:nth-of-type()' seemed to work but for me to get every artist on the page, I would need to increase the value inside of nth-of-type every time so I used a list of chars and converted them into a string after incrementing I at each traversal.

Python Web Scraping - How to scrape this type of site?

Okay, so I need to scrape the following webpage: https://www.programmableweb.com/category/all/apis?deadpool=1
It's a list of APIs. There are approx 22,000 APIs to scrape.
I need to:
1) Get the URL of each API in the table (pages 1-889), and also to scrape the following info:
API name
Description
Category
Submitted
2) I then need to scrape a bunch of information from each URL.
3) Export the data to a CSV
The thing is, I’m a bit lost on how to think about this project. From what I can see, there are no AJAX calls being made to populate the table, which means I’m going to have to parse the HTML directly (right?)
In my head, the logic would be something like this:
Use the requests & BS4 libraries to scrape the table
Then, somehow grab the HREF from every row
Access that HREF, scrape the data, move onto the next one
Rinse and repeat for all table rows.
Am I on the right track, is this possible with requests & BS4?
Here are some screenshots of what I've been trying to explain.
Thank you SOO much for any help. This is hurting my head haha
Here we go using requests, BeautifulSoup and pandas:
import requests
from bs4 import BeautifulSoup
import pandas as pd
url = 'https://www.programmableweb.com/category/all/apis?deadpool=1&page='
num = int(input('How Many Page to Parse?> '))
print('please wait....')

name, desc, cat, sub = [], [], [], []
# Each output list is filled from the <td> cells carrying the paired class string.
cell_classes = (
    (name, 'views-field views-field-title col-md-3'),
    (desc, 'views-field views-field-search-api-excerpt views-field-field-api-description hidden-xs visible-md visible-sm col-md-8'),
    (cat, 'views-field views-field-field-article-primary-category'),
    (sub, 'views-field views-field-created'),
)

for i in range(num):
    r = requests.get(f"{url}{i}")
    soup = BeautifulSoup(r.text, 'html.parser')
    for column, css_class in cell_classes:
        column.extend(cell.text for cell in soup.findAll('td', attrs={'class': css_class}))

# One tuple per table row: (name, description, category, submitted).
result = list(zip(name, desc, cat, sub))
df = pd.DataFrame(
    result, columns=['API Name', 'Description', 'Category', 'Submitted'])
df.to_csv('output.csv')
print('Task Completed, Result saved to output.csv file.')
Result can be viewed online: Check Here
Output Simple:
Now For href parsing:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# deadpool=1 matches the listing named in the question and the URL used by the
# table script, so output.csv and data.csv describe the same rows.
url = 'https://www.programmableweb.com/category/all/apis?deadpool=1&page='
num = int(input('How Many Page to Parse?> '))
print('please wait....')

# Pass 1: collect the detail-page link of every API listed on the requested pages.
links = []
for i in range(num):
    r = requests.get(f"{url}{i}")
    soup = BeautifulSoup(r.text, 'html.parser')
    for cell in soup.findAll('td', attrs={'class': 'views-field views-field-title col-md-3'}):
        # href=True skips anchors without an href, which would otherwise make
        # the string concatenation fail on None.
        for anchor in cell.findAll('a', href=True):
            links.append('https://www.programmableweb.com' + anchor['href'])

# Pass 2: visit each detail page and keep the text of its field spans.
spans = []
for link in links:
    r = requests.get(link)
    soup = BeautifulSoup(r.text, 'html.parser')
    spans.append([span.text for span in soup.select('div.field span')])

data = list(spans)
df = pd.DataFrame(data)
df.to_csv('data.csv')
print('Task Completed, Result saved to data.csv file.')
Check Result Online: Here
Sample View is Below:
In Case if you want those 2 csv files together so here's the code:
import pandas as pd
a = pd.read_csv("output.csv")
b = pd.read_csv("data.csv")
merged = a.merge(b)
merged.to_csv("final.csv", index=False)
Online Result: Here
You should read more about scraping if you are going to pursue it .
from bs4 import BeautifulSoup
import csv , os , requests
from urllib import parse
def SaveAsCsv(list_of_rows):
    """Append one record to ``data.csv`` in the current directory.

    Despite the parameter name, ``list_of_rows`` is written as a single CSV
    row. If a ``PermissionError`` is raised while opening or writing the
    file, a hint is printed and the row is not written.
    """
    try:
        with open('data.csv', mode='a', newline='', encoding='utf-8') as handle:
            writer = csv.writer(handle)
            writer.writerow(list_of_rows)
    except PermissionError:
        print("Please make sure data.csv is closed\n")
# Write the header row only when data.csv is not already there and readable.
if not (os.path.isfile('data.csv') and os.access('data.csv', os.R_OK)):
    SaveAsCsv([ 'api_name','api_link','api_desc','api_cat'])
else:
    print("File data.csv Already exists \n")

BaseUrl = 'https://www.programmableweb.com/category/all/apis?deadpool=1&page={}'
for i in range(1, 890):
    print('## Getting Page {} out of 889'.format(i))
    url = BaseUrl.format(i)
    res = requests.get(url)
    soup = BeautifulSoup(res.text,'html.parser')
    table_rows = soup.select('div.view-content > table[class="views-table cols-4 table"] > tbody tr')
    for row in table_rows:
        tds = row.select('td')
        name_cell, desc_cell = tds[0], tds[1]
        api_name = name_cell.text.strip()
        # Resolve the row's link against the page URL to get an absolute address.
        api_link = parse.urljoin(url, name_cell.find('a').get('href'))
        api_desc = desc_cell.text.strip()
        # The category cell may be absent; fall back to an empty string.
        if len(tds) >= 3:
            api_cat = tds[2].text.strip()
        else:
            api_cat = ''
        SaveAsCsv([api_name,api_link,api_desc,api_cat])

Unable to print once to get all the data altogether

I've written a script in Python to scrape the tabular content from a webpage. The first column of the main table contains names. Some names link to another page; others are plain text without any link. When a name has no link, I only want to parse its row. When a name does have a link, the script should first parse the corresponding row from the main table and then follow the link to parse the associated information for that name from the table at the bottom of the page, under the title Companies. Finally, everything should be written to a CSV file.
site link
I've tried so far:
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
link = "https://suite.endole.co.uk/insight/company/ajax_people.php?ajax_url=ajax_people&page=1&company_number=03512889"
base = "https://suite.endole.co.uk"
res = requests.get(link)
soup = BeautifulSoup(res.text,"lxml")
# Walk every row of the main table except the first (skipped by the [1:] slice).
for item in soup.select("table tr")[1:]:
# Rows whose cells contain no link: only the row's own cell texts are printed.
if not item.select_one("td a[href]"):
first_table = [i.text for i in item.select("td")]
print(first_table)
else:
# Rows with a link: print the row, then follow the link for the extra table.
first_table = [i.text for i in item.select("td")]
print(first_table)
url = urljoin(base,item.select_one("td a[href]").get("href"))
resp = requests.get(url)
soup_ano = BeautifulSoup(resp.text,"lxml")
# Rows of the table inside the .content element containing the text "Companies",
# again skipping the first row.
for elems in soup_ano.select(".content:contains(Companies) table tr")[1:]:
associated_info = [elem.text for elem in elems.select("td")]
print(associated_info)
My script above can do almost everything, but I can't work out the logic to collect all the data and print it once, rather than printing three times, so that I can write it to a CSV file.
Put all your scraped data into a list, here I've called the list associated_info then all the data is in one place & you can iterate over the list to print it out to a CSV if you like...
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
link = "https://suite.endole.co.uk/insight/company/ajax_people.php?ajax_url=ajax_people&page=1&company_number=03512889"
base = "https://suite.endole.co.uk"
res = requests.get(link)
soup = BeautifulSoup(res.text,"lxml")
associated_info = []
for item in soup.select("table tr")[1:]:
    # The row's own cells are recorded whether or not the row links anywhere.
    associated_info.append([i.text for i in item.select("td")])

    anchor = item.select_one("td a[href]")
    if not anchor:
        continue

    # Linked rows: follow the link and add the rows of the "Companies" table.
    url = urljoin(base, anchor.get("href"))
    resp = requests.get(url)
    soup_ano = BeautifulSoup(resp.text,"lxml")
    company_rows = soup_ano.select(".content:contains(Companies) table tr")[1:]
    for elems in company_rows:
        associated_info.append([elem.text for elem in elems.select("td")])
print(associated_info)

how to loop using beautifulsoup

I am trying to scrape data on car model, price, mileage, location, etc using beautifulsoup. However, the return result only reports data on one random car. I want to be able to collect data on all cars advertised on the site to date. My python code is below. How can I modify my code to retrieve data such that each day I have information on car model, price, mileage, location, etc? Example:
Car model price mileage location date
Toyota Corrola $4500 22km Accra 16/02/2018
Nissan Almera $9500 60km Tema 16/02/2018
etc
import requests
from bs4 import BeautifulSoup
import pandas
import csv
from datetime import datetime
# Question code (Python 2): intended to scrape model, price, mileage and location.
for i in range(300):
# NOTE(review): the URL string has no "{}" placeholder, so .format(i) returns
# the same address on every iteration.
url = "https://tonaton.com/en/ads/ghana/cars?".format(i)
r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
# Python 2 print statement.
print soup.prettify()
# find() is used here rather than findAll(), so data holds a single item-content element.
data = soup.find(class_='item-content')
# NOTE(review): the loop variable tag is never used; every lookup below goes
# through data, so each iteration reads the same values.
for tag in data:
item_title = data.find("a",attrs={"class":"item-title h4"})
model = item_title.text.encode('utf-8').strip()
item_meta = data.find("p",attrs={"class":"item-meta"})
mileage = item_meta.text.encode('utf-8').strip()
item_location = data.find("p",attrs={"class":"item-location"})
location = item_location.text.encode('utf-8').strip()
item_info = data.find("p",attrs={"class":"item-info"})
price = item_info.text.encode('utf-8').strip()
# Opened in append mode, so rows accumulate across runs.
with open('example.csv', 'a') as csv_file:
writer = csv.writer(csv_file)
writer.writerow([model, price, mileage, location, datetime.now()])
First off, this loop:
for i in range(300):
url = "https://tonaton.com/en/ads/ghana/cars?".format(i)
is not doing what I assume you think it is. This loop simply resets the url 300 times and leaves you with the original url you set. You need to wrap all your code in this loop to ensure you are hitting each of the URLs you want (1-300).
Restructure your code (paying attention to indents!) so that the next url is the one being used in the request:
# This will print ALOT of titles
for i in range(300):
url = "https://tonaton.com/en/ads/ghana/cars?" + str(i)
print(url) # Notice how the url changes with each iteration?
r = requests.get(url)
soup = bsoup(r.content, "html.parser")
titles = soup.findAll("a",attrs={"class":"item-title h4"})
for item in titles:
currTitle = item.text.encode('utf-8').strip()
print(currTitle)
This code:
import requests
from bs4 import BeautifulSoup as bsoup
url = "https://tonaton.com/en/ads/ghana/cars?1"
r = requests.get(url)
soup = bsoup(r.content, "html.parser")
titles = soup.findAll("a",attrs={"class":"item-title h4"})
for item in titles:
print(item.text.encode('utf-8').strip())
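Yields (the b prefix marks a bytes literal: .encode('utf-8') turns each title into bytes, and printing bytes in Python 3 shows b'...'; drop the .encode() call to print plain text):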
b'Hyundai Veloster 2013'
b'Ford Edge 2009'
b'Mercedes-Benz C300 2016'
b'Mazda Demio 2007'
b'Hyundai Santa fe 2005'
# And so on...
The problem is that 1) if you call find(), it will stop after you find the first match given your params. Using findAll() will dump all matches into a list which you then can iterate through and process as needed. And 2) the result you get from a call to find() is a broken structure of the original HTML. Thus the next find() calls won't work.
import requests
from bs4 import BeautifulSoup as bsoup
import csv
from datetime import datetime
# Open the output once and write one row per listing. The previous version
# kept only the last value of each field and wrote a single row.
with open('index.csv', 'w', newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    for i in range(300):
        # Append the counter explicitly: .format(i) on a string without "{}"
        # left the URL unchanged on every iteration.
        # NOTE(review): "?<n>" paging is taken from the earlier snippet in this
        # thread; confirm the site's real paging parameter.
        url = "https://tonaton.com/en/ads/ghana/cars?" + str(i)
        r = requests.get(url)
        soup = bsoup(r.content, "html.parser")

        # Read the four fields from within each listing container so they stay
        # aligned per car.
        # NOTE(review): assumes every item-content element wraps the title,
        # info, meta and location tags, as the question's code does - confirm.
        for listing in soup.findAll(class_='item-content'):
            item_title = listing.find("a", attrs={"class": "item-title h4"})
            item_info = listing.find("p", attrs={"class": "item-info"})
            item_meta = listing.find("p", attrs={"class": "item-meta"})
            item_location = listing.find("p", attrs={"class": "item-location"})

            # Keep text as str (no .encode): writing bytes to a csv file in
            # Python 3 produces literal "b'...'" cells. Missing tags become ''.
            model, price, mileage, location = (
                tag.text.strip() if tag is not None else ''
                for tag in (item_title, item_info, item_meta, item_location)
            )
            writer.writerow([model, price, mileage, location, datetime.now()])

Categories

Resources