Scrapy Crawler: Scrape lists from pages - python

Objective: Crawl this page
https://www.cardplayer.com/poker-tournaments/monthly/2021/06
And then get a list of all the tournaments on each page.
here is my code
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
import pandas as pd
# Shared results table. The columns are declared up front so that rows from
# every crawled page can be appended in place instead of being overwritten.
mydf = pd.DataFrame(columns=["Event", "start", "days", "buyin"])


class TournamentsSpider(CrawlSpider):
    """Crawl the monthly CardPlayer listing and collect every tournament row.

    The single rule follows each link found in the second column of the
    monthly table and hands the resulting page to :meth:`parse_item`.
    """

    name = 'tournaments'
    allowed_domains = ['www.cardplayer.com']
    start_urls = ['https://www.cardplayer.com/poker-tournaments/monthly/2021/06']
    rules = (
        Rule(LinkExtractor(restrict_xpaths='/html/body/div[5]/div/div[2]/div[2]/div[3]/div/table/tbody/tr/td[2]/a'),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Record every event row found on one followed page.

        Each table row becomes one record (event name, start, days, buy-in).
        The record is appended to the module-level ``mydf`` and is also
        yielded so that Scrapy pipelines and feed exports can consume it.

        Args:
            response: the Scrapy response for the followed link.

        Yields:
            dict: one record per table row that contains ``td`` cells.
        """
        # NOTE: the author observed that some pages carry two tables;
        # treating those pages differently is still an open point.
        for series in response.xpath('/html/body/div[5]/div/div[2]/div[3]/table/tbody'):
            for row in series.xpath('.//tr'):
                # Rows without data cells (e.g. header rows) carry no event.
                if not row.xpath('./td'):
                    continue
                # Relative paths keep each value tied to its own row, and
                # .get() stores the text rather than a selector object.
                record = {
                    "Event": row.xpath('./td[1]/a/text()').get(default='').strip(),
                    "start": row.xpath('./td[2]/text()').get(default='').strip(),
                    "days": row.xpath('./td[3]/text()').get(default='').strip(),
                    "buyin": row.xpath('./td[4]/text()').get(default='').strip(),
                }
                # Append in place: assigning whole columns would replace the
                # rows gathered from earlier pages.
                mydf.loc[len(mydf)] = [record[column] for column in mydf.columns]
                yield record
# Run the spider inside this script with a legacy browser user agent, then
# print whatever the spider stored in the shared DataFrame.
crawler_settings = {
    'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
}
process = CrawlerProcess(crawler_settings)
process.crawl(TournamentsSpider)
process.start()
print(mydf)
I can see the crawler finds all the URLs but the output only comes back for just 1 page so I'm doing something wrong.

Here is how I attempted this using bs4; you just need to enter the number of years you want to collect.
# Get Product Page Links
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
# Prefix shared by every monthly listing URL; "<year>/<month>" is appended to it.
baseurl = 'https://www.cardplayer.com/poker-tournaments/monthly/'
# Browser-like request headers.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
# Accumulates one dict per scraped tournament; GetPageData appends to it.
Tournaments = []
def GetPageData(url):
    """Scrape one monthly listing page and append its tournaments to ``Tournaments``.

    Every ``tr`` element with an empty class is treated as a table row. The
    first three text lines of a row are read as date, name and location.
    The heading row and the "No tournament series found." placeholder are
    skipped.

    Args:
        url: address of the monthly listing page.
    """
    # Send the module-level browser headers (they were previously defined
    # but never passed), and release the connection even if parsing fails.
    with requests.get(url, headers=headers) as r:
        soup = BeautifulSoup(r.content, 'lxml')

    # Get all tr elements with empty class; the first one holds the titles.
    productlist = soup.find_all('tr', class_='')
    for item in productlist[1:]:
        fields = str(item.text).strip().splitlines()
        # Pad so rows with fewer than three lines still unpack cleanly.
        Date, Name, Location = (fields + ['', '', ''])[:3]
        if Date in ('Dates', 'No tournament series found.'):
            continue
        print('Added: ', Name)
        Tournaments.append({
            'date': Date,
            'name': Name,
            'location': Location,
        })
def GetTournaments(yearsToCollect):
    """Scrape the monthly listings for ``yearsToCollect`` years and save them.

    The first year covers the current month through December; every later
    year covers January through December. All collected tournaments are then
    written to ``Tournaments.xlsx``.

    Args:
        yearsToCollect: number of calendar years to cover, counting the
            current one.
    """
    today = datetime.today()
    for offset in range(yearsToCollect):
        year = today.year + offset
        # Only the current year starts mid-way; months are numbered 1..12.
        first_month = today.month if offset == 0 else 1
        for month in range(first_month, 13):
            # Zero-padded month, matching the site's ".../monthly/2021/06" form.
            GetPageData(f'{baseurl}{year}/{month:02d}')
    # Save to .xlsx
    Tournamentsdf = pd.DataFrame(Tournaments)
    Tournamentsdf.to_excel('Tournaments.xlsx', index=False)
if __name__ == "__main__":
    # Number of calendar years to collect, counting the current one.
    yearsToCollect = 2
    GetTournaments(yearsToCollect=yearsToCollect)

Related

Multiple values against the same tag not scraping

I'm getting no values for my "Number of Rooms" and "Room" search.
https://www.zoopla.co.uk/property/uprn/906032139/
I can see here that I should be returning something but not getting anything.
Can anyone possibly point me in the right direction of how to solve this? I am not even sure what to search for, as it's not erroring. I thought it would put all the data in and then I would need to figure out a way to separate it. Do I perhaps need to scrape it into a dictionary?
import requests
from bs4 import BeautifulSoup as bs
import numpy as np
import pandas as pd
import matplotlib as plt
import time
# Headers that make the requests look like an ordinary desktop browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.157 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.5",
    "Referer": "https://google.co.uk",
    "DNT": "1"
}


def _collect_texts(soup, tag_name, css_class, transform=None):
    """Return the stripped text of every ``tag_name`` element with ``css_class``.

    ``transform`` is applied to each text when given. An ``AttributeError``
    raised while collecting yields an empty string instead of a list.
    """
    try:
        texts = []
        for node in soup.find_all(tag_name, class_=css_class):
            text = node.get_text(strip=True)
            texts.append(transform(text) if transform else text)
        return texts
    except AttributeError:
        return ""


def _strip_price(text):
    """Drop thousands separators and the pound sign from a price string."""
    return text.replace(",", "").replace("£", "")


# (key in the result dict, tag name, class attribute, optional clean-up)
_DETAIL_FIELDS = (
    ("Status", "span", "css-10o3xac-Tag e164ranr11", None),
    ("Status Date", "p", "css-1jq4rzj e164ranr10", None),
    ("Value", "p", "css-1x01gac-Text eczcs4p0", _strip_price),
    ("Number of Rooms", "p", "css-82kmy1 e13gx5i3", None),
    ("Room", "span", "css-1avcdf2 e13gx5i4", None),
)

page = 1
addresses = []
while page != 2:
    url = f"https://www.zoopla.co.uk/house-prices/edinburgh/?pn={page}"
    print(url)
    response = requests.get(url, headers=headers)
    print(response)
    html = response.content
    soup = bs(html, "lxml")
    time.sleep(1)
    for address in soup.find_all("div", class_="c-rgUPM c-rgUPM-pnwXf-hasUprn-true"):
        details = {}
        # The heading holds the address; the first link leads to the
        # property's own page.
        details["Address"] = address.h2.get_text(strip=True)
        scotland_house_url = f'https://www.zoopla.co.uk{address.find("a")["href"]}'
        details["URL"] = scotland_house_url
        scotland_house_url_response = requests.get(
            scotland_house_url, headers=headers)
        scotland_house_soup = bs(scotland_house_url_response.content, "lxml")
        # Every remaining field is a list of texts taken from the property page.
        for key, tag_name, css_class, transform in _DETAIL_FIELDS:
            details[key] = _collect_texts(
                scotland_house_soup, tag_name, css_class, transform)
        addresses.append(details)
    page = page + 1

for address in addresses[:]:
    print(address)
print(response)
Selecting by class_="css-1avcdf2 e13gx5i4" seems brittle, the class might change all the time. Try different CSS selector:
import requests
from bs4 import BeautifulSoup
url = "https://www.zoopla.co.uk/property/uprn/906032139/"
page_html = requests.get(url).content
soup = BeautifulSoup(page_html, "html.parser")
# Pick the paragraph that directly follows the one holding the "bed" icon
# inside the element with id "timeline".
tag = soup.select_one('#timeline p:has(svg[data-testid="bed"]) + p')
bed_text = tag.get_text(strip=True, separator=" ")
# The text is expected to split into exactly two parts: count and label.
no_beds, beds = bed_text.split()
print(no_beds, beds)
Prints:
1 bed
If you want all types of rooms:
# Same idea for every icon type: each matching paragraph yields "count|label".
room_paragraphs = soup.select("#timeline p:has(svg[data-testid]) + p")
for detail in room_paragraphs:
    joined = detail.get_text(strip=True, separator="|")
    n, type_ = joined.split("|")
    print(n, type_)
Prints:
1 bed
1 bath
1 reception

web-scraping and pagination with python, csv, beautifulsoup and Pandas

This website, https://aviation-safety.net/wikibase/, has a database that runs from 1902 to 2022. The code presented here captures some years but misses others: the years before 1912 and the years after 2021 are not captured. I want to scrape all accidents for each type of aircraft, either for all years or for selected year(s). The database starts at https://aviation-safety.net/wikibase/dblist.php?Year=1902 and should end at https://aviation-safety.net/wikibase/dblist.php?Year=2022. Currently the code dumps the results into a .csv file, but it could also be SQLite.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
def scrape_year(year):
    """Scrape every listing page for ``year`` and write the rows to a CSV file.

    The page count is read from the pager links on the first page. Every
    result page is then fetched, and each table row whose class matches
    ``list.*`` (except the first one on the page) becomes one record. Rows
    whose date cannot be parsed are dropped. The records are saved to
    ``<year>_aviation-safety.csv``.

    Args:
        year: the year to request from the database listing.
    """
    # Browser-like headers, in case requests without "accept" and
    # "user-agent" are rejected.
    headers = {
        'accept':'*/*',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
    first_page = BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')
    page_container = first_page.find('div', {'class': 'pagenumbers'})
    # Each pager link ends in "=<page number>"; the largest one is the last page.
    pages = max(int(link['href'].split('=')[-1]) for link in page_container.find_all('a'))

    info = []
    row_class = re.compile('list.*')
    for page in range(1, pages + 1):
        new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
        print(new_url)
        data = requests.get(new_url, headers=headers)
        soup = BeautifulSoup(data.text, 'html.parser')
        table = soup.find('table', {'class': 'hp'})
        # The first matching row of each page is skipped.
        for row in table.find_all('tr', {'class': row_class})[1:]:
            anchor = row.find('a')
            acc_link = 'https://aviation-safety.net/' + anchor['href']
            raw_date = anchor.text.strip()
            # Try the text as written, then with "01" and "01-01" put in
            # front of it; give up on the row if none of them parses.
            for prefix in ("", "01", "01-01"):
                try:
                    acc_date = datetime.strptime(prefix + raw_date, '%d-%b-%Y').strftime('%Y-%m-%d')
                    break
                except ValueError:
                    pass
            else:
                continue
            cells = row.find_all('td')
            info.append({
                'acc_link' : acc_link,
                'acc_date': acc_date,
                'acc_type': cells[1].text,
                'acc_reg': cells[2].text,
                'acc_operator' : cells[3].text,
                'acc_fat': cells[4].text,
                'acc_location': cells[5].text,
                'acc_dmg': cells[7].text,
            })
    df = pd.DataFrame(info)
    df.to_csv(f'{year}_aviation-safety.csv', encoding='utf-8-sig', index=False)
if __name__ == "__main__":
    START = 1901
    STOP = 2023
    # Inclusive range of years, one scraping task per year.
    years = list(range(START, STOP + 1))
    print(f'Scraping {len(years)} years of data')
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scrape_year, years)
Lmao, I wrote that code for someone on this site once before. I've edited to work for the missing years here:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
def scrape_year(year):
    """Return every accident row listed for ``year`` as a list of dicts.

    The page count is read from the pager on the first page; a year without
    a pager is treated as a single page. Malformed rows and pages without a
    result table are skipped individually, so they no longer discard the
    rest of the year.

    Args:
        year: the year to request from the database listing.

    Returns:
        list[dict]: one dict per accident with the keys ``acc_link``,
        ``acc_date``, ``acc_type``, ``acc_reg``, ``acc_operator``,
        ``acc_fat``, ``acc_location`` and ``acc_dmg``. An empty list is
        returned when the year could not be scraped at all.
    """
    headers = {
        'accept':'*/*',
        'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    }
    # Built before the ``try`` so the error handler can always report it.
    url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page=1'
    # Loop invariants: the row-class pattern and the date prefixes to try.
    row_class = re.compile('list.*')
    date_prefixes = ("", "01", "01-01")
    try:
        req = requests.get(url, headers=headers)
        soup = BeautifulSoup(req.text,'html.parser')
        page_container = soup.find('div',{'class':'pagenumbers'})
        try:
            pages = max(int(link['href'].split('=')[-1]) for link in page_container.find_all('a'))
        except (AttributeError, KeyError, ValueError):
            # No pager (None), a link without href, a non-numeric page
            # value, or no links at all: assume a single page.
            pages = 1
        info = []
        for page in range(1,pages+1):
            new_url = f'https://aviation-safety.net/wikibase/dblist.php?Year={year}&sorteer=datekey&page={page}'
            print(new_url)
            data = requests.get(new_url,headers=headers)
            soup = BeautifulSoup(data.text,'html.parser')
            table = soup.find('table',{'class':'hp'})
            if table is None:
                # Keep the rows already collected from the other pages.
                continue
            for row in table.find_all('tr',{'class':row_class}):
                anchor = row.find('a')
                cells = row.find_all('td')
                # A row needs a link and at least eight cells to be usable.
                if anchor is None or not anchor.get('href') or len(cells) < 8:
                    continue
                raw_date = anchor.text.strip()
                acc_date = None
                # Try the text as written, then with "01" and "01-01" put
                # in front of it.
                for prefix in date_prefixes:
                    try:
                        acc_date = datetime.strptime(prefix + raw_date,'%d-%b-%Y').strftime('%Y-%m-%d')
                        break
                    except ValueError:
                        continue
                if acc_date is None:
                    continue
                info.append({
                    'acc_link' : 'https://aviation-safety.net/' + anchor['href'],
                    'acc_date': acc_date,
                    'acc_type': cells[1].text,
                    'acc_reg': cells[2].text,
                    'acc_operator' : cells[3].text,
                    'acc_fat': cells[4].text,
                    'acc_location': cells[5].text,
                    'acc_dmg': cells[7].text,
                })
        return info
    except Exception as e:
        # Worker boundary: report the failure and let the other years go on.
        print(e, url)
        return []
if __name__ == "__main__":
    START = 2022
    STOP = 2023
    # Inclusive range of years, one scraping task per year.
    years = list(range(START, STOP + 1))
    print(f'Scraping {len(years)} years of data')
    with concurrent.futures.ThreadPoolExecutor(max_workers=60) as executor:
        final_list = executor.map(scrape_year, years)
    list_of_dicts = list(final_list)
    # Flatten the per-year lists into one list of records.
    flat_list = []
    for sublist in list_of_dicts:
        flat_list.extend(sublist)
    df = pd.DataFrame(flat_list)
    df.to_csv('all_years_aviation-safety.csv', index=False)

Web-Scraping using BeautifulSoup (missing values when scraping)

I have been trying to webscrape a realtor website using BeautifulSoup and encountered 2 difficulties that I cannot seem to fix.
Difficulties:
When I run my code below, I am missing some date values. The dataframe should hold 68 rows of data scraped from the first page. The description and title scrapes return 68 rows, but the date scrape returns 66. I don't get N/A values returned if its missing either. Does anyone have an idea why this is? When I inspected the website elements it had the same tags, except it is listed as VIP or Special (promotion) apartments.
Secondly, I cannot seem to figure out how to scrape meta itemprop tags. I keep getting blank values when I use:
for tag in soup.findAll('div',attrs={'class':'announcement-block-text-container announcement-block__text-container'}):
for tag2 in tag.findAll('div', attrs={'class':'announcement-block__date'}):
Thank you in advance for any assistance you could provide.
Python Code:
from urllib.request import urlopen,Request
from bs4 import BeautifulSoup as bsoup
import ssl
import pandas as pd
def get_headers():
    """Return a fresh dict of browser-like HTTP request headers."""
    return {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
    }
# TLS context with hostname and certificate checks switched off. It is
# created here but is not passed to urlopen anywhere in this script.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
count = 1 # for pagination
# One result list per scraped field.
title, description, date = [], [], []
urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/']
for x in urls:
    count = 1
    y = x
    while count < 2: # will get only 1st page
        print(x)
        req = Request(x, headers=get_headers()) #req all headers
        htmlfile = urlopen(req)
        htmltext = htmlfile.read()
        soup = bsoup(htmltext, 'html.parser')
        containers = soup.findAll('div', attrs={'class': 'announcement-block-text-container announcement-block__text-container'})
        # Each list only grows when its element is present in a container,
        # so the three lists can end up with different lengths.
        for tag in containers:
            for element, css_class, bucket in (
                ('a', 'announcement-block__title', title),
                ('div', 'announcement-block__description', description),
                ('div', 'announcement-block__date', date),
            ):
                for tag2 in tag.findAll(element, attrs={'class': css_class}):
                    text = tag2.get_text().strip()
                    bucket.append(text if len(text) > 0 else 'N/A')
        # Go to next page
        count += 1
        page = '?page=' + str(count)
        x = y + page
# zip() stops at the shortest list, so surplus titles/descriptions are dropped.
data_frame = pd.DataFrame(list(zip(title, description, date)), columns=['Title', 'Description', 'Date'])
You get 66 items because your date[] contains only 66 elements, therefore, you need to check all three fields at once in one for loop. Your if else checks do nothing as there are no announcement-block__date divs with empty content on the page.
from urllib.request import urlopen,Request
from bs4 import BeautifulSoup as bsoup
import ssl
import pandas as pd
def get_headers():
    """Build and return the browser-like headers used for each request."""
    header_pairs = (
        ('accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'),
        ('accept-language', 'en-US,en;q=0.9'),
        ('cache-control', 'max-age=0'),
        ('upgrade-insecure-requests', '1'),
        ('user-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'),
    )
    return dict(header_pairs)
# NOTE(review): this context disables hostname and certificate checks, but
# it is never passed to urlopen, so the default verification stays in
# effect. It is kept only so the name remains defined.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE
count = 1 # for pagination
# One list per field. All three are appended together for every
# announcement, so they always have the same length.
info = {
    'title': [],
    'description': [],
    'date': []
}
urls = ['https://www.unegui.mn/l-hdlh/l-hdlh-zarna/oron-suuts-zarna/5-r/']
for x in urls:
    count = 1
    y = x
    while count < 2: # will get only 1st page
        print(x)
        req = Request(x, headers=get_headers()) #req all headers
        # Close the HTTP response as soon as the body has been read.
        with urlopen(req) as htmlfile:
            htmltext = htmlfile.read()
        soup = bsoup(htmltext, 'html.parser')
        for tag in soup.find_all('div', attrs={'class': 'announcement-block-text-container announcement-block__text-container'}):
            title = tag.find('a', attrs={'class': 'announcement-block__title'})
            description = tag.find('div', attrs={'class': 'announcement-block__description'})
            date = tag.find('div', attrs={'class': 'announcement-block__date'})
            # A missing element becomes 'N/A' so the row stays aligned.
            info['title'].append(title.get_text().strip() if title else 'N/A')
            info['description'].append(description.get_text().strip() if description else 'N/A')
            info['date'].append(date.get_text().strip() if date else 'N/A')
        # Go to next page
        count = count + 1
        page = '?page=' + str(count)
        x = y + page
data_frame = pd.DataFrame(list(zip(info['title'], info['description'], info['date'])), columns=['Title', 'Description', 'Date'])
print(len(info['title']), len(info['description']), len(info['date']))
print(data_frame)
About your second question, a similar question has already been answered here

Pagination links are repetitive in my BeautifulSoup Python code

from bs4 import BeautifulSoup
import requests
import csv
class Parse():
    """Scrape attraction cards from a TripAdvisor listing page.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self):
        # Every row scraped so far, across all calls to parse().
        self.row_list = []
        # Site root, prepended to the relative "next page" link.
        self.base_url = 'https://www.tripadvisor.co.uk'

    def parse(self, url):
        """Fetch ``url`` and extract its attraction cards.

        Args:
            url: address of the listing page.

        Returns:
            tuple: ``(next_page, rows)``. ``next_page`` is the absolute URL
            of the next-page link, or ``None`` when the page has none.
            ``rows`` is a list of ``[name, rating, status, review_count]``
            lists, one per card (empty when no card is found), in the shape
            ``write_csv`` expects.
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51'}
        # Pass the headers by keyword: the second positional argument of
        # requests.get is ``params``, so they were sent as a query string.
        response = requests.get(url, headers=headers).text
        soup = BeautifulSoup(response, 'html.parser')
        next_link = soup.find('a', class_='_23XJjgWS _1hF7hP_9 _2QvUxWyA')
        next_page = self.base_url + next_link.attrs['href'] if next_link else None
        rows = []
        for card in soup.find_all('section', class_='_2TabEHya _3YhIe-Un'):
            name = card.find('div', class_='_1gpq3zsA _1zP41Z7X').text
            rating = self.remove(filter_col=str(card.find('svg', class_='zWXXYhVR')))
            review_count = card.find('span', class_='DrjyGw-P _26S7gyB4 _14_buatE _1dimhEoy').text
            status = card.find('div', class_='DrjyGw-P _26S7gyB4 _3SccQt-T').text
            # Keep every card, not only the last one.
            rows.append([name, rating, status, review_count])
        self.row_list.extend(rows)
        return next_page, rows

    def remove(self, filter_col):
        """Return the last three characters of the second space-separated token.

        Args:
            filter_col: text to cut the rating from (``parse`` passes the
                string form of the rating ``svg`` element).
        """
        rating = filter_col.split(' ')[1]
        return rating[-3:]

    def write_csv(self, row_list):
        """Write ``row_list`` (an iterable of rows) to ``top_sites.csv``."""
        # newline='' lets the csv module control line endings itself.
        with open('top_sites.csv', 'w', newline='') as file:
            csv_writer = csv.writer(file, delimiter=',')
            csv_writer.writerows(row_list)
if __name__ == '__main__':
    # Parse one listing page and show the link found for the next page.
    url = "https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa30-Myanmar.html"
    parsing = Parse()
    parsed = parsing.parse(url=url)
    next_url, row_list = parsed
    print(next_url)
PS C:\Users\Caspe\PycharmProjects\Selenium Test> & "c:/Users/Caspe/PycharmProjects/Selenium Test/.venv/Scripts/python.exe" "c:/Users/Caspe/PycharmProjects/Selenium Test/Demo/tripadvisor_topattract.py"
https://www.tripadvisor.co.uk/Attractions-g294190-Activities-Myanmar.html
PS C:\Users\Caspe\PycharmProjects\Selenium Test>
I'm trying to scrape data from TripAdvisor Website using BeautifulSoup.
Link: https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa30-Myanmar.html
Instead of going to next page, the link is repeated itself. Is there a solution for my problem?
I've selected the correct selector for the soup and I was able to scrape data.
To get pagination working, it's necessary to change the -oa<index>- part in URL:
import csv
import requests
from bs4 import BeautifulSoup
# The "-oa<offset>-" part of the URL selects the page; offsets advance by 30.
url = "https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa{}-Myanmar.html"
data = []
for page in range(0, 4):  # <--- increase page count here
    print("Getting page {}..".format(page))
    soup = BeautifulSoup(
        requests.get(url.format(page * 30)).content, "html.parser"
    )
    for title in soup.select('span[name="title"]'):
        # Split once only, so a "|" produced by extra nested text inside
        # the title does not break the two-way unpack.
        no, t = title.get_text(strip=True, separator="|").split("|", 1)
        rating = title.find_next("svg")
        review_count = rating.find_next("span")
        status = review_count.find_next(
            "div", class_="DrjyGw-P _26S7gyB4 _3SccQt-T"
        )
        data.append((no, t, rating["title"], review_count.text, status.text))

# newline="" is what the csv module expects of its file object, and an
# explicit encoding keeps non-ASCII names from failing on other locales.
with open("data.csv", "w", newline="", encoding="utf-8") as f_out:
    w = csv.writer(f_out)
    w.writerows(data)
Writes data.csv (screenshot from LibreOffice):

Extracting table data using beautifulsoup in python

I have a webpage - https://www.1800wheelchair.com/category/369/transport-wheelchairs/ - from which I want to extract the name, URL, SKU and specifications (from the table) of each product. I wrote the code below, but I am getting an empty Excel file. I have been trying to fix it for a long time but can't work out what is going wrong.
import requests
import xlsxwriter
from bs4 import BeautifulSoup
def cpap_spider(max_pages):
    """Walk the category pages and write one worksheet row per product.

    For every product heading on pages 1..``max_pages`` the title is written
    to column 0 of the current row, then ``each_item`` is called with the
    product link and the link is printed.

    Args:
        max_pages: number of category pages to visit.
    """
    global row_i
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
    for page in range(1, max_pages + 1):
        url = "https://www.1800wheelchair.com/category/369/transport-wheelchairs/?p=" + str(page)
        listing = requests.get(url, headers=headers).content
        soup = BeautifulSoup(listing, 'html.parser')
        for link in soup.findAll("h2", {"class": "product-name"}):
            href = link.find("a")['href']
            title = link.string
            worksheet.write(row_i, 0, title)
            each_item(href)
            print(href)
            #print(title)
def each_item(item_url):
# Fetch one product page and write the cells of its specifications table
# into the shared worksheet.
# NOTE(review): indentation was lost in this copy of the code, so the nesting
# of the statements below is not visible here - confirm it against the
# original before relying on these comments.
global cols_names, row_i
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
soup = BeautifulSoup(requests.get(item_url, headers=headers).content, 'html.parser')
# NOTE(review): the class value below ends with a space ("specifications ")
# - confirm that this is intended.
table=soup.find("table", {"class":"specifications "})
# Without a matching table there is nothing to write for this product.
if table:
table_rows = table.find_all('tr')
else:
return
for row in table_rows:
cols = row.find_all('td')
for ele in range(0,len(cols)):
temp = cols[ele].text.strip()
if temp:
# Drop one trailing colon from the cell text.
if temp[-1:] == ":":
temp = temp[:-1]
# Name of column
if ele == 0:
# Reuse the column index of a known name; otherwise register the name
# and write it into the header row (row 0).
try:
cols_names_i = cols_names.index(temp)
except:
cols_names.append(temp)
cols_names_i = len(cols_names) - 1
worksheet.write(0, cols_names_i + 1, temp)
continue;
# Column 0 holds the title, so spec columns are shifted right by one.
worksheet.write(row_i, cols_names_i + 1, temp)
# Advance the shared row counter.
row_i += 1
# Output workbook with a single sheet; the first header cell is the title column.
workbook = xlsxwriter.Workbook('all_appended.xlsx')
worksheet = workbook.add_worksheet()
worksheet.write(0, 0, "Title")
# Shared state read and updated by cpap_spider/each_item: the spec column
# names seen so far, the index of the current one, and the current data row.
cols_names, cols_names_i, row_i = [], 0, 1
cpap_spider(1)
workbook.close()
You have an extra space in your class name {"class":"specifications "}), removed and the excel file was generated with multiple specs columns and data lines.
As a suggestion, if you're willing to add some extra libraries, you can use pandas to read the specifications tables as data frames with pd.read_html and use the included function df.to_excel to write an Excel file (which can use the same xlsxwriter engine you're already using) without worrying about incrementing rows and columns.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from functools import reduce
# Request headers sent with every HTTP call.
AGENT = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
# Site root; also stripped (together with "product/") from product links to get a short name.
BASE_URL = "https://www.1800wheelchair.com/"
# Category path; the page number is appended to it.
CATG_URL = "category/369/transport-wheelchairs/?p="
def cpap_spider(max_pages):
    """Collect chair names and their specification tables from the category pages.

    Products without a specifications table are skipped, so the names and
    the tables always stay aligned.

    Args:
        max_pages: number of category pages to visit, starting at page 1.

    Returns:
        list: ``[chair_names, chair_tables]``. ``chair_names`` starts with
        ``"Specs"`` followed by one name per product (at most 20 characters,
        taken from the product URL). ``chair_tables`` is the HTML of all
        specification tables joined into one string.
    """
    chair_names = ["Specs"]
    tables = []
    for page in range(1, max_pages + 1):
        url = BASE_URL + CATG_URL + str(page)
        soup = BeautifulSoup(requests.get(
            url, headers=AGENT).content, 'html.parser')
        for link in soup.find_all("h2", {"class": "product-name"}):
            href = link.find("a")['href']
            table_html = each_item(href)
            print(href)
            if not table_html:
                # each_item returns None when the page has no table;
                # appending it to a string used to raise a TypeError.
                continue
            chair_name = href.replace(BASE_URL + "product/", "")
            chair_names.append(chair_name[:20])
            tables.append(table_html)
    return [chair_names, ''.join(tables)]
def each_item(item_url):
    """Return the product page's specifications table as HTML text.

    Args:
        item_url: address of the product page.

    Returns:
        str | None: the table markup, or ``None`` when the page has no
        table with class ``specifications``.
    """
    response = requests.get(item_url, headers=AGENT)
    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find("table", {"class": "specifications"})
    if not table:
        return None
    return str(table)
from io import StringIO

chair_name, chair_list = cpap_spider(1)
# create a list of dataframes from html tables; the markup is wrapped in
# StringIO because passing literal HTML to read_html is deprecated
df = pd.read_html(StringIO(chair_list))
# Name each table's columns before merging: column 0 is the spec label and
# column 1 that chair's values. Merging many frames that all carry an
# unnamed column 1 produces clashing "_x"/"_y" suffixes.
frames = [
    table.rename(columns={0: "Specs", 1: name})
    for table, name in zip(df, chair_name[1:])
]
# merge the spec. tables list into one dataframe
all_chairs = reduce(
    lambda left, right: pd.merge(left, right, on="Specs", how='outer'), frames)
# use the spec labels as index
all_chairs.set_index("Specs", drop=True, inplace=True)
# transpose to get chairs as index and specs as columns
all_chairs = all_chairs.T
all_chairs.to_excel("all_appended.xlsx")
Output from all_appended.xlsx

Categories

Resources