I am using bs4 to write a web scraper to obtain funding news data.
The first part of my code extracts the title, link, summary and date
of each article for n pages.
The second part of my code loops through the link column and passes
the resulting URL to a new function, which extracts the URL of the
company in question.
For the most part, the code works fine (40 pages scraped without errors). I am trying to stress test it by raising it to 80 pages, but I'm running into KeyError: 'href' and I don't know how to fix this.
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm
def clean_data(column):
    df[column] = df[column].str.encode('ascii', 'ignore').str.decode('ascii')

#extract
def extract(page):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
    url = f'https://www.uktechnews.info/category/investment-round/series-a/page/{page}/'
    r = requests.get(url, headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

#transform
def transform(soup):
    for item in soup.find_all('div', class_='post-block-style'):
        title = item.find('h3', {'class': 'post-title'}).text.replace('\n', '')
        link = item.find('a')['href']
        summary = item.find('p').text
        date = item.find('span', {'class': 'post-meta-date'}).text.replace('\n', '')
        news = {
            'title': title,
            'link': link,
            'summary': summary,
            'date': date
        }
        newslist.append(news)
    return
newslist = []
#subpage
def extract_subpage(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
    r = requests.get(url, headers)
    soup_subpage = BeautifulSoup(r.text, 'html.parser')
    return soup_subpage

def transform_subpage(soup_subpage):
    main_data = soup_subpage.select("div.entry-content.clearfix > p > a")
    if len(main_data):
        subpage_link = {
            'subpage_link': main_data[0]['href']
        }
        subpage.append(subpage_link)
    else:
        subpage_link = {
            'subpage_link': '--'
        }
        subpage.append(subpage_link)
    return
subpage = []
#load
page = np.arange(0, 80, 1).tolist()

for page in tqdm(page):
    try:
        c = extract(page)
        transform(c)
    except:
        None

df1 = pd.DataFrame(newslist)

for url in tqdm(df1['link']):
    t = extract_subpage(url)
    transform_subpage(t)

df2 = pd.DataFrame(subpage)
Here is the relevant part of the error (screenshot of the traceback omitted): KeyError: 'href'.
I think the issue is that my if statement for the transform_subpage function does not account for instances where main_data is not an empty list but does not contain href links. I am relatively new to Python so any help would be much appreciated!
You are correct, it's caused by main_data[0] not having an 'href' attribute at some point. You can try changing the logic to something like:
def transform_subpage(soup_subpage):
    main_data = soup_subpage.select("div.entry-content.clearfix > p > a")
    if len(main_data) and 'href' in main_data[0].attrs:
        subpage_link = {
            'subpage_link': main_data[0]['href']
        }
    else:
        subpage_link = {
            'subpage_link': '--'
        }
    subpage.append(subpage_link)
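A slightly more compact way to express the same guard, if you prefer, is Tag.get, which returns a fallback value when an attribute is missing; this is just an alternative sketch of the same fix:
def transform_subpage(soup_subpage):
    main_data = soup_subpage.select("div.entry-content.clearfix > p > a")
    # .get() falls back to '--' when the first anchor has no href,
    # and the conditional covers the case where no anchor matched at all
    link = main_data[0].get('href', '--') if main_data else '--'
    subpage.append({'subpage_link': link})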
Also just a note, it's probably not a great idea to iterate through a variable list, and use the same variable name for each item in the list. So change to something like:
page_list = np.arange(0, 80, 1).tolist()
for page in tqdm(page_list):
I'm following a Udemy course on learning BS4. It seems to be a bit outdated, so I'm having trouble with this part.
The objective is to scrape the price of a TV from an Amazon page. In the course the instructor also gets this error and fixes it by changing the class name he searches for via findAll. I tried the same thing (a different class, not the same one he used) and was met with the AttributeError again. According to the answer to a similar issue, the class being searched for didn't contain what was being looked for, but I don't believe that is what's happening to me.
The code: https://pastebin.com/SMQBXt31
from datetime import datetime
import requests
import csv
import bs4

USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.3 Safari/605.1.15"
REQUEST_HEADER = {
    "User-Agent": USER_AGENT,
    "Accept-Language": "en-US, en;q=0.5"
}

def get_page_html(url):
    res = requests.get(url=url, headers=REQUEST_HEADER)  # res = response
    return res.content

def get_product_price(soup):
    main_price_span = soup.find("span", attrs={
        "class": "a-price aok-align-center reinventPricePriceToPayPadding priceToPay"
    })
    price_spans = main_price_span.findAll("span")
    for span in price_spans:
        price = span.text.strip().replace("$", "").replace(",", "")
        print(price)

def extract_product_info(url):
    product_info = {}
    print(f"Scraping URL: {url}")
    html = get_page_html(url)
    soup = bs4.BeautifulSoup(html, "lxml")
    product_info["price"] = get_product_price(soup)

if __name__ == '__main__':
    with open("amazon_products_urls.csv", newline="") as csvfile:
        reader = csv.reader(csvfile, delimiter=",")
        for row in reader:
            url = row[0]
            print(extract_product_info(url))
The website:https://www.amazon.com/Hisense-Premium-65-Inch-Compatibility-65U8G/dp/B091XWTGXL/ref=sr_1_1_sspa?crid=3NYCKNFHL6DU2&keywords=hisense%2Bpremium%2B65%2Binch&qid=1651840513&sprefix=hisense%2Bpremium%2B65%2Binch%2B%2Caps%2C116&sr=8-1-spons&spLa=ZW5jcnlwdGVkUXVhbGlmaWVyPUEyVzUyTjBMS1JCVFVRJmVuY3J5cHRlZElkPUEwNDY2ODc0MlozVlFMVFJKQ0s2VyZlbmNyeXB0ZWRBZElkPUEwODI5OTgxMTRZSjdMMzYyQjk4NyZ3aWRnZXROYW1lPXNwX2F0ZiZhY3Rpb249Y2xpY2tSZWRpcmVjdCZkb05vdExvZ0NsaWNrPXRydWU&th=1
There are a lot of spans there; you have to select only the price spans, which are located in [class="a-size-mini olpWrapper"]:
price_spans = main_price_span.find_all("span", class_="a-size-mini olpWrapper")
for span in price_spans:
    price = span.text.strip().replace("$", "").replace(",", "")
    print(price)

# OR
price_spans = [x.get_text(strip=True).replace("$", "") for x in main_price_span.find_all("span", class_="a-size-mini olpWrapper")]
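One more thing worth guarding against: if Amazon serves a robot-check page or the class names change, soup.find returns None and the next call on it raises exactly the kind of AttributeError described in the question, so it may be worth checking for that. A minimal sketch of a defensive version (same class names as above, returning the value so product_info["price"] is actually populated):
def get_product_price(soup):
    main_price_span = soup.find("span", attrs={
        "class": "a-price aok-align-center reinventPricePriceToPayPadding priceToPay"
    })
    if main_price_span is None:
        # nothing matched: blocked page, robot check, or a renamed class
        return None
    prices = [span.get_text(strip=True).replace("$", "").replace(",", "")
              for span in main_price_span.find_all("span", class_="a-size-mini olpWrapper")]
    return prices[0] if prices else None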
I need to get the text "Платонов А.П.". Here's my code so far.
import requests
from bs4 import BeautifulSoup
from pip._internal.network.utils import HEADERS
URL = "https://www.moscowbooks.ru/books/?sortby=name&sortdown=false"
HEADERS = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15", "accept" : "*/*"}
HOST = "https://www.moscowbooks.ru"
def get_html(url, params=None):
    r = requests.get(url, headers=HEADERS, params=params)
    return r

def get_content(html):
    soup = BeautifulSoup(html, "html.parser")
    items = soup.find_all("div", class_="catalog__item")
    books = []
    for item in items:
        author_check = item.find("a", class_="author-name")
        if author_check:
            author = author_check.get_text()
        else:
            author_check = "Автор не указан"
        books.append({
            "title": item.find("div", class_="book-preview__title").get_text(strip=True),
            "author": author_check,
            "link": HOST + item.find("a", class_="book-preview__title-link").get("href"),
            "cost": item.find("div", class_="book-preview__price").get_text(strip=True),
        })
    print(books)
    print(len(books))

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        get_content(html.text)
    else:
        print("Error")

parse()
I have a problem with the author field, because it comes back like this:
<a class="author-name" href="/catalog/author/21381/">Платонов А. П. </a>
I also need a little help with the price, because sometimes I get '2\xa0274' instead of '2 274'.
The problem is that you define "author": author_check in your dictionary, while author_check = item.find("a", class_="author-name") and the text you actually want is author = author_check.get_text(). You can change your for loop to something like this:
for item in items:
    author_check = item.find("a", class_="author-name")
    if author_check:
        author = author_check.text
    else:
        author = "Автор не указан"
For your issue with the display of the prices, you can just replace \xa0 with a comma or a space.
"cost": item.find("div", class_="book-preview__price").get_text(strip=True).replace(u"\xa0", ",")
I've had to deal with a similar problem. You can work on the tag's HTML string instead, and do the following:
author = str(author_check).split('>')[-2].split('<')[0]
You might have to substitute -2 with -1.
I am scraping data from the Rakuten Japanese e-commerce website using requests-html and Beautiful Soup.
The problem is that when I make the request from my local PC (127.0.0.1) it works fine, but when I make it from my EC2 server I only get Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/> back and no data or HTML page at all. On the other hand, when I fetch the same page URL from the server with wget I get the full page, so it is only my script that fails.
Here is my code sample:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    session = HTMLSession()
    session.headers.update(headers)
    request_url = base_url + query_keyword
    resp = session.get(request_url)
    soup = BeautifulSoup(resp.text, "lxml")
    return soup

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output = {
            'title': product_title,
        }
        output_list.append(output)
    print(output_list)
    return output_list

def main_rakuten_product_search(query):
    query_keyword = query
    page = get_search_url(query_keyword)
    product_lists = feature_product_details(page)
    return product_lists

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    main_rakuten_product_search(queries)
Sample output when running local server:
[
{
"title": "【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース リレーアタック対策 ボックス 箱 電波遮断ケース RFIDブロッキング 高級PUレザー 高級車盗難防止 カーセキュリティ 高級感溢れ レクサス(グレー)",
}
]
But I don't get any response when running it on my server; it just shows this message:
Reference #<esi:vars name="AKAMAI_DEBUG_STRING"/>
If anyone has any idea how this could be fixed, I would be grateful to hear it.
I've tried your code on an EC2 in ap-northeast-1 (Tokyo) and I'm getting the sample output.
So, here are a few things to check:
- make sure your EC2 has the right ports open
- double check the headers (I've modified yours a bit - see code below)
- check your query input; maybe some of them are malformed?
- don't spray the rakuten server with too many requests from one EC2; maybe they're blocking you already (see the pacing sketch after the code)
Here's your code after some slight tuning:
from bs4 import BeautifulSoup
from requests_html import HTMLSession

def get_search_url(query_keyword):
    base_url = 'https://search.rakuten.co.jp/search/mall/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
        'referer': 'https://www.rakuten.co.jp/'
    }
    session = HTMLSession()
    session.headers.update(headers)
    return BeautifulSoup(session.get(base_url + query_keyword).content, "lxml")

def feature_product_details(url):
    output_list = []
    for i in url.find_all("div", attrs={"class": "dui-card searchresultitem"}):
        product_title = i.find("div", attrs={"class": "content title"})
        if product_title is not None:
            product_title = product_title.getText()
        else:
            product_title = ""
        output_list.append({'title': product_title})
    return output_list

def main_rakuten_product_search(query):
    return feature_product_details(get_search_url(query))

if __name__ == '__main__':
    queries = '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース '
    print(main_rakuten_product_search(queries))
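On the last point in the checklist: if you do need to run several searches from the one EC2 instance, it may help to space the requests out. A minimal pacing sketch reusing main_rakuten_product_search (the query list and the two-second delay are just placeholders):
import time

# hypothetical list of search keywords; the first is the query from the question
search_terms = [
    '【レビュー特典中】スマートキー 電波遮断ケース 電波遮断ボックス リレーアタック防止用 キーケース ',
    'キーケース',
]

for term in search_terms:
    products = main_rakuten_product_search(term)
    print(len(products), 'results for', term)
    time.sleep(2)  # pause between searches so the server is not hit in a burst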
I'm trying to scrape a website for job postings data, and the output looks like this:
[{'job_title': 'Junior Data Scientist',
  'company': '\n\n    BBC',
  'summary': "\n We're now seeking a Junior Data Scientist to come and work with our Marketing & Audiences team in London. The Data Science team are responsible for designing...",
  'link': 'www.jobsite.com',
  'summary_text': "Job Introduction\nImagine if Netflix, The Huffington Post, ESPN, and Spotify were all rolled into one....etc
I want to create a dataframe, or a CSV, that looks like this:
Right now, this is the loop I'm using:
for page in pages:
    source = requests.get('https://www.jobsite.co.uk/jobs?q=data+scientist&start='.format()).text
    soup = BeautifulSoup(source, 'lxml')

    results = []
    for jobs in soup.findAll(class_='result'):
        result = {
            'job_title': '',
            'company': '',
            'summary': '',
            'link': '',
            'summary_text': ''
        }
After the loop, I just print the results.
What would be a good way to get the output in a dataframe? Thanks!
Look at the pandas DataFrame API. There are several ways you can initialize a dataframe:
- a list of dictionaries
- a list of lists (see the short sketch after the example below)
You just need to append either a list or a dictionary to a global variable, and you should be good to go.
results = []
for page in pages:
    source = requests.get('https://www.jobsite.co.uk/jobs?q=data+scientist&start='.format()).text
    soup = BeautifulSoup(source, 'lxml')

    for jobs in soup.findAll(class_='result'):
        result = {
            'job_title': '',  # assuming this has a value like in the example in your question
            'company': '',
            'summary': '',
            'link': '',
            'summary_text': ''
        }
        results.append(result)

# results is now a list of dictionaries
df = pandas.DataFrame(results)
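The list-of-lists form mentioned above also works, as long as you pass the column names yourself; a quick sketch with one made-up row:
import pandas

# each inner list is one row, in the same order as the column names
rows = [
    ['Junior Data Scientist', 'BBC', "We're now seeking a Junior Data Scientist...", 'www.jobsite.com', 'Job Introduction...'],
]
df = pandas.DataFrame(rows, columns=['job_title', 'company', 'summary', 'link', 'summary_text'])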
One other suggestion: don't dump this into a dataframe within the same program. Dump all your HTML files into a folder first, and then parse them again. That way, if you need more information from a page that you hadn't considered before, or if the program terminates due to a parsing error or timeout, the work is not lost. Keep the parsing separate from the crawling logic.
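A minimal sketch of that split (the URL pattern, page range and pages/ directory here are just placeholders):
import os
import requests
from bs4 import BeautifulSoup

os.makedirs('pages', exist_ok=True)

# crawl step: just save the raw HTML, one file per results page
for page in range(1, 4):
    url = 'https://www.jobsite.co.uk/jobs?q=data+scientist&page={}'.format(page)
    html = requests.get(url).text
    with open(os.path.join('pages', 'page_{}.html'.format(page)), 'w', encoding='utf-8') as f:
        f.write(html)

# parse step (can be a separate script run later): read the saved files back
for name in sorted(os.listdir('pages')):
    with open(os.path.join('pages', name), encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'lxml')
    # ... extract the job fields from soup as before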
I think you need to define the number of pages and add that to your url (make sure you have a placeholder for the value, which I don't think your code, nor the other answer, has). I have done this by extending your url to include a page parameter in the querystring, with a placeholder for its value.
Is your selector of class result correct? You could certainly also use for job in soup.select('.job'):. You would then need to define appropriate selectors to populate the values. I think it is easier to grab all the job links for each page, then visit each link and extract the values from a JSON-like string in the page. Add a Session to re-use the connection.
Delays between requests are required to prevent being blocked.
import requests
from bs4 import BeautifulSoup as bs
import json
import pandas as pd
import time

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
results = []
links = []
pages = 3

with requests.Session() as s:
    for page in range(1, pages + 1):
        try:
            url = 'https://www.jobsite.co.uk/jobs?q=data+scientist&start=1&page={}'.format(page)
            source = s.get(url, headers=headers).text
            soup = bs(source, 'lxml')
            links.append([link['href'] for link in soup.select('.job-title a')])
        except Exception as e:
            print(e, url)
        finally:
            time.sleep(2)

    final_list = [item for sublist in links for item in sublist]

    for link in final_list:
        source = s.get(link, headers=headers).text
        soup = bs(source, 'lxml')
        data = soup.select_one('#jobPostingSchema').text  # json like string containing all info
        item = json.loads(data)
        result = {
            'Title': item['title'],
            'Company': item['hiringOrganization']['name'],
            'Url': link,
            'Summary': bs(item['description'], 'lxml').text
        }
        results.append(result)
        time.sleep(1)

df = pd.DataFrame(results, columns=['Title', 'Company', 'Url', 'Summary'])
print(df)
df.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8-sig', index=False)
Sample of results:
I can't imagine you want all pages but you could use something similar to:
import requests
from bs4 import BeautifulSoup as bs
import json
import pandas as pd
import time

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36'}
results = []
links = []
pages = 0

def get_links(url, page):
    try:
        source = s.get(url, headers=headers).text
        soup = bs(source, 'lxml')
        page_links = [link['href'] for link in soup.select('.job-title a')]
        if page == 1:
            global pages
            pages = int(soup.select_one('.page-title span').text.replace(',', ''))
    except Exception as e:
        print(e, url)
    finally:
        time.sleep(1)
    return page_links

with requests.Session() as s:
    links.append(get_links('https://www.jobsite.co.uk/jobs?q=data+scientist&start=1&page=1', 1))
    for page in range(2, pages + 1):
        url = 'https://www.jobsite.co.uk/jobs?q=data+scientist&start=1&page={}'.format(page)
        links.append(get_links(url, page))

    final_list = [item for sublist in links for item in sublist]

    for link in final_list:
        source = s.get(link, headers=headers).text
        soup = bs(source, 'lxml')
        data = soup.select_one('#jobPostingSchema').text  # json like string containing all info
        item = json.loads(data)
        result = {
            'Title': item['title'],
            'Company': item['hiringOrganization']['name'],
            'Url': link,
            'Summary': bs(item['description'], 'lxml').text
        }
        results.append(result)
        time.sleep(1)

df = pd.DataFrame(results, columns=['Title', 'Company', 'Url', 'Summary'])
print(df)
df.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8-sig', index=False)
I am new to web scraping, so my apologies in advance if I'm misunderstanding anything...
I am trying to get data from ESPN. Here is my python code:
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = 'http://espn.go.com/nba/teams'
r = requests.get(url)
soup = BeautifulSoup(r.text)

tables = soup.find_all('dl')

teams = []
prefix_1 = []
prefix_2 = []
teams_urls = []

for table in tables:
    lis = table.find_all('dt', text=False)
    print lis
    for li in lis:
        info = dt
        teams.append(info.text)
        url = info['href']
        teams_urls.append(url)
        prefix_1.append(url.split('/')[-2])
        prefix_2.append(url.split('/')[-1])

print (teams)
When I print at various points, I am getting empty brackets [] back. Please help. Thanks.
You are extracting the team names from the menu, but the actual page content contains teams also.
Let's use CSS selectors to get to each team link on the page and construct a list of dictionaries with the team names and urls inside:
import requests
from bs4 import BeautifulSoup

url = 'http://espn.go.com/nba/teams'
r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'})
soup = BeautifulSoup(r.content, 'lxml')

teams = []
for link in soup.select('div.mod-table div.mod-content ul li h5 a[href]'):
    teams.append({
        'name': link.text,
        'url': link['href']
    })
print(teams)
Prints:
[
    {'name': u'Boston Celtics', 'url': 'http://espn.go.com/nba/team/_/name/bos/boston-celtics'},
    {'name': u'Brooklyn Nets', 'url': 'http://espn.go.com/nba/team/_/name/bkn/brooklyn-nets'},
    ...
    {'name': u'Utah Jazz', 'url': 'http://espn.go.com/nba/team/_/name/utah/utah-jazz'}
]
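If you still need the prefix_1 and prefix_2 lists from your original script, you can derive them from each collected url afterwards with the same split('/') logic (a small sketch):
prefix_1 = []
prefix_2 = []
teams_urls = []

for team in teams:
    teams_urls.append(team['url'])
    parts = team['url'].split('/')
    prefix_1.append(parts[-2])  # e.g. 'bos'
    prefix_2.append(parts[-1])  # e.g. 'boston-celtics'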