I have a webpage - https://www.1800wheelchair.com/category/369/transport-wheelchairs/ - from which I want to extract the name, URL, SKU, and specifications (from the table) of each product. I wrote the code below, but I am getting an empty Excel file. I have been trying to fix it for a long time but can't figure out what is going wrong.
import requests
import xlsxwriter
from bs4 import BeautifulSoup
def cpap_spider(max_pages):
global row_i
page=1
while page<=max_pages:
url= "https://www.1800wheelchair.com/category/369/transport-wheelchairs/?p=" +str(page)
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
for link in soup.findAll("h2", {"class":"product-name"}):
href=link.find("a")['href']
title = link.string
worksheet.write(row_i, 0, title)
each_item(href)
print(href)
#print(title)
page+=1
def each_item(item_url):
global cols_names, row_i
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
soup = BeautifulSoup(requests.get(item_url, headers=headers).content, 'html.parser')
table=soup.find("table", {"class":"specifications "})
if table:
table_rows = table.find_all('tr')
else:
return
for row in table_rows:
cols = row.find_all('td')
for ele in range(0,len(cols)):
temp = cols[ele].text.strip()
if temp:
if temp[-1:] == ":":
temp = temp[:-1]
# Name of column
if ele == 0:
try:
cols_names_i = cols_names.index(temp)
except:
cols_names.append(temp)
cols_names_i = len(cols_names) - 1
worksheet.write(0, cols_names_i + 1, temp)
continue;
worksheet.write(row_i, cols_names_i + 1, temp)
row_i += 1
cols_names=[]
cols_names_i = 0
row_i = 1
workbook = xlsxwriter.Workbook('all_appended.xlsx')
worksheet = workbook.add_worksheet()
worksheet.write(0, 0, "Title")
cpap_spider(1)
workbook.close()
You have an extra space in your class name ({"class": "specifications "}). After removing it, the Excel file was generated with multiple spec columns and data rows.
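That is, the lookup in each_item should be:
table = soup.find("table", {"class": "specifications"})  # no trailing space in the class name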
As a suggestion, if you're willing to add an extra library, you can use pandas to read the specification tables as data frames with pd.read_html and use the built-in df.to_excel to write an Excel file (which can use the same xlsxwriter engine you're already using) without worrying about incrementing rows and columns.
import requests
from bs4 import BeautifulSoup
import pandas as pd
from functools import reduce
AGENT = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
BASE_URL = "https://www.1800wheelchair.com/"
CATG_URL = "category/369/transport-wheelchairs/?p="
def cpap_spider(max_pages):
chair_names = ["Specs"]
chair_tables = ''
page = 1
while page <= max_pages:
url = BASE_URL+CATG_URL+str(page)
soup = BeautifulSoup(requests.get(
url, headers=AGENT).content, 'html.parser')
for link in soup.findAll("h2", {"class": "product-name"}):
href = link.find("a")['href']
title = link.string
chair_name = href.replace(BASE_URL+"product/","")
chair_names.append(chair_name[:20])
chair_tables += each_item(href)
print(href)
page += 1
return [chair_names, chair_tables]
def each_item(item_url):
soup = BeautifulSoup(requests.get(
item_url, headers=AGENT).content, 'html.parser')
table = soup.find("table", {"class": "specifications"})
if table:
return str(table)
chair_name, chair_list = cpap_spider(1)
# create a list of dataframes from html tables
df = pd.read_html(chair_list)
# merge the spec. tables list into one dataframe
all_chairs = reduce(lambda left, right: pd.merge(left, right, on=[0], how='outer'), df)
# add chair names as indices
all_chairs.columns = chair_name
all_chairs.set_index("Specs", drop=True, inplace=True)
# transpose to get chairs as index and specs as columns
all_chairs = all_chairs.T
all_chairs.to_excel("all_appended.xlsx")
Output from all_appended.xlsx
I scraped an HTML table from the Yahoo Finance website and tried to export it to a CSV file. However, the CSV file does not contain the correct output, even though the output printed to my terminal looks fine. What have I done wrong here?
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
mystocks = ["XOM", "CVX", "COP", "EOG"]
stockdata = []
def getData(symbol):
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"}
url = f"https://finance.yahoo.com/quote/{symbol}/key-statistics"
soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")
print("Ticker - "+symbol)
for t in soup.select("table"):
for tr in t.select("tr:has(td)"):
for sup in tr.select("sup"):
sup.extract()
stockdata = [td.get_text(strip=True) for td in tr.select("td")]
if len(stockdata) == 2:
print("{:<50} {}".format(*stockdata))
for item in mystocks:
stockdata.append(getData(item))
df = pd.DataFrame(stockdata)
df.to_csv('file_name.csv')
You are printing, not returning the data.
If you want all the data in one table, it helps to add a column with the symbol from which each row originated. You could use something like this:
import requests
from bs4 import BeautifulSoup
import csv
import pandas as pd
mystocks = ["XOM", "CVX", "COP", "EOG"]
stockdata = []
def getData(symbol):
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0"}
url = f"https://finance.yahoo.com/quote/{symbol}/key-statistics"
soup = BeautifulSoup(requests.get(url, headers=headers).content, "html.parser")
print("Ticker - "+symbol)
for t in soup.select("table"):
for tr in t.select("tr:has(td)"):
for sup in tr.select("sup"):
sup.extract()
stockdata = [td.get_text(strip=True) for td in tr.select("td")]
if len(stockdata) == 2:
# add a column with the symbol to help afterwards
yield [symbol] + stockdata
# this will concatenate the rows for all the symbols in mystocks
df = pd.DataFrame([r for item in mystocks for r in getData(item)])
df.to_csv('file_name.csv')
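As a small optional refinement (my own addition, not part of the original answer), you can give the columns explicit names before writing the CSV; the labels below are only illustrative:
df = pd.DataFrame(
    [r for item in mystocks for r in getData(item)],
    columns=["Symbol", "Statistic", "Value"],  # illustrative column names
)
df.to_csv('file_name.csv', index=False)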
From this URL, https://doc8643.com/aircrafts, I want to scrape all rows.
Then, for each individual row (for example https://doc8643.com/aircraft/A139),
I want to scrape these three areas of data:
<table class="table centered-table">
<h4>Manufacturers</h4>
<h4>Technical Data</h4>
Can this be done in Python?
import requests, csv
from bs4 import BeautifulSoup
from urllib.request import Request
url = 'https://doc8643.com/aircrafts'
req = Request(url , headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})
with open('doc8643.csv', "w", encoding="utf-8") as f:
writer = csv.writer(f)
while True:
print(url)
html = requests.get(url)
soup = BeautifulSoup(html.text, 'html.parser')
# Go throught table = tbody and extract the data under the 'td' tag
for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
writer.writerow([c.text if c.text else '' for c in row.select('h3')])
print(row)
# If more than one page then iterate through all of them
if soup.select_one('ul.pagination li.active + li a'):
url = soup.select_one('ul.pagination li.active + li a')['href']
else:
break
You should create a function that takes the value c.text (i.e. A139), builds the full URL like https://doc8643.com/aircraft/A139, and uses requests and BeautifulSoup to get all the needed data:
def scrape_details(number):
url = 'https://doc8643.com/aircraft/' + number
print('details:', url)
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
# ... scrape details and put in list `results` ...
return results
and run it in your loop
for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
data = [c.text if c.text else '' for c in row.select('h3')]
for item in data:
values = scrape_details(item)
writer.writerow([item] + values)
The biggest problem is scraping the details.
For some details you need to scrape the dl elements, then all the dt and dd inside them, and use zip() to group them into pairs.
Something like this:
def scrape_details(number):
url = 'https://doc8643.com/aircraft/' + number
print('details:', url)
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
results = []
all_dl = soup.find_all('dl')
for item in all_dl:
all_dt = item.find_all('dt')
all_dd = item.find_all('dd')
for dt, dd in zip(all_dt, all_dd):
pair = f"{dt.string}: {dd.string}"
results.append(pair)
print(pair)
#print(results)
return results
but this needs more code, so I skip that part here.
Minimal working code
EDIT: I added url = 'https://doc8643.com' + url
import csv
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}
# --- functions ---
def scrape_details(number):
url = 'https://doc8643.com/aircraft/' + number
print('details:', url)
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
results = []
all_dl = soup.find_all('dl')
for item in all_dl:
all_dt = item.find_all('dt')
all_dd = item.find_all('dd')
for dt, dd in zip(all_dt, all_dd):
pair = f"{dt.string}: {dd.string}"
results.append(pair)
print(pair)
#print(results)
return results
# --- main ---
url = 'https://doc8643.com/aircrafts'
with open('doc8643.csv', "w", encoding="utf-8") as f:
writer = csv.writer(f)
writer.writerow(["data1", "data2", "data3", "etc..."])
while True:
print('url:', url)
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
# Go through the aircraft list and extract the code under the 'h3' tag
for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
data = [c.text if c.text else '' for c in row.select('h3')]
for item in data:
values = scrape_details(item)
writer.writerow([item] + values)
# If more than one page then iterate through all of them
if soup.select_one('ul.pagination li.active + li a'):
url = soup.select_one('ul.pagination li.active + li a')['href']
url = 'https://doc8643.com' + url
else:
break
BTW:
Maybe it would be better to keep results as a dictionary:
results[dt.string] = [dd.string]
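A minimal sketch of that dictionary variant (the function name scrape_details_dict is my own; it assumes the same dl/dt/dd structure as above) could look like this:
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

def scrape_details_dict(number):
    url = 'https://doc8643.com/aircraft/' + number
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')
    results = {}
    for dl in soup.find_all('dl'):
        # pair each dt (label) with its dd (value) and keep it under its own key
        for dt, dd in zip(dl.find_all('dt'), dl.find_all('dd')):
            results[dt.string] = dd.string
    return results
With dictionaries you could later write the rows with csv.DictWriter, which lines up the same spec across different aircraft even when some fields are missing.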
I am trying to scrape:
https://id.investing.com/commodities/gold-historical-data
table from 2010-2020, but the problem is that the URL stays the same whether I use the default dates or the dates I chose. So how can I tell Python to scrape data from 2010-2020? Please help; I'm using Python 3.
This is my code:
import requests, bs4
url = 'https://id.investing.com/commodities/gold-historical-data'
headers = {"User-Agent":"Mozilla/5.0"}
response = requests.get(url, headers=headers)
soup = bs4.BeautifulSoup(response.text, 'lxml')
tables = soup.find_all('table')
print(soup)
with open('emasfile.csv','w') as csv:
for row in tables[1].find_all('tr'):
line = ""
for td in row.find_all(['td', 'th']):
line += '"' + td.text + '",'
csv.write(line + '\n')
This page uses JavaScript with AJAX to get data from
https://id.investing.com/instruments/HistoricalDataAjax
It sends POST requests with extra data, including the start and end dates ("st_date", "end_date").
You can try to use 01/01/2010 and 12/31/2020 directly, but I used a for-loop to get every year separately.
I got all this information from DevTools (the 'Network' tab) in Chrome/Firefox.
import requests
from bs4 import BeautifulSoup
import csv
url = 'https://id.investing.com/instruments/HistoricalDataAjax'
payload = {
"curr_id": "8830",
"smlID": "300004",
"header": "Data+Historis+Emas+Berjangka",
"st_date": "01/30/2020",
"end_date": "12/31/2020",
"interval_sec": "Daily",
"sort_col": "date",
"sort_ord": "DESC",
"action":"historical_data"
}
headers = {
#"Referer": "https://id.investing.com/commodities/gold-historical-data",
"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0",
"X-Requested-With": "XMLHttpRequest"
}
fh = open('output.csv', 'w')
csv_writer = csv.writer(fh)
for year in range(2010, 2021):
print('year:', year)
payload["st_date"] = f"01/01/{year}"
payload["end_date"] = f"12/31/{year}"
r = requests.post(url, data=payload, headers=headers)
#print(r.text)
soup = BeautifulSoup(r.text, 'lxml')
table = soup.find('table')
for row in table.find_all('tr')[1:]: # [1:] to skip header
row_data = [item.text for item in row.find_all('td')]
print(row_data)
csv_writer.writerow(row_data)
fh.close()
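As an optional variant (my own sketch, not part of the original answer; it assumes the AJAX response contains the same HTML table), pandas can parse each year's table directly and concatenate the years:
import requests
import pandas as pd

url = 'https://id.investing.com/instruments/HistoricalDataAjax'
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:80.0) Gecko/20100101 Firefox/80.0",
    "X-Requested-With": "XMLHttpRequest",
}

frames = []
for year in range(2010, 2021):
    payload = {
        "curr_id": "8830",
        "smlID": "300004",
        "header": "Data+Historis+Emas+Berjangka",
        "st_date": f"01/01/{year}",
        "end_date": f"12/31/{year}",
        "interval_sec": "Daily",
        "sort_col": "date",
        "sort_ord": "DESC",
        "action": "historical_data",
    }
    r = requests.post(url, data=payload, headers=headers)
    # pd.read_html returns a list of DataFrames; the price table is the first one
    frames.append(pd.read_html(r.text)[0])

# one DataFrame for all years, written without the numeric index
pd.concat(frames, ignore_index=True).to_csv('output.csv', index=False)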
I have written the following code to get data from each product on the website https://www.1800wheelchair.com/category/369/transport-wheelchairs/?p=3, but there seems to be something wrong with it. It does not give any error, but it also doesn't give the required output. My guess is that I am extracting the link of each product incorrectly, which is why nothing is printed in the output panel. I have been trying to figure this out for a long time, but have not been able to.
import requests
import xlsxwriter
from bs4 import BeautifulSoup
def cpap_spider(max_pages):
global row_i
page=1
while page<=max_pages:
url= "https://www.1800wheelchair.com/category/369/transport-wheelchairs/?p=" +str(page)
source_code= requests.get(url)
plain_text= source_code.text
soup= BeautifulSoup(plain_text, 'html.parser')
for link in soup.findAll("h2", {"class":"product-name"}):
href=link.find("a")['href']
title = link.string
#worksheet.write(row_i, 0, title)
#each_item(href)
print(href)
#print(title)
page+=1
def each_item(item_url):
global cols_names, row_i
source_code= requests.get(item_url)
plain_text= source_code.text
soup= BeautifulSoup(plain_text, 'html.parser')
table=soup.find("table", {"class":"specifications "})
if table:
table_rows = table.find_all('tr')
else:
return
for row in table_rows:
cols = row.select('td')
for ele in range(0,len(cols)):
temp = cols[ele].text.strip()
if temp:
if temp[-1:] == ":":
temp = temp[:-1]
# Name of column
if ele == 0:
try:
cols_names_i = cols_names.index(temp)
except:
cols_names.append(temp)
cols_names_i = len(cols_names) - 1
worksheet.write(0, cols_names_i + 1, temp)
continue;
worksheet.write(row_i, cols_names_i + 1, temp)
row_i += 1
cols_names=[]
cols_names_i = 0
row_i = 1
workbook = xlsxwriter.Workbook('st.xlsx')
worksheet = workbook.add_worksheet()
worksheet.write(0, 0, "Title")
cpap_spider(3)
workbook.close()
To obtain the correct results, set the User-Agent HTTP header on your request:
import requests
from bs4 import BeautifulSoup
url = 'https://www.1800wheelchair.com/category/369/transport-wheelchairs/?p=3'
headers = {
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'
}
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
for idx, a in enumerate(soup.select('a[itemprop="url"].button'), 1):
print('{:<3} {}'.format(idx, a['href']))
Prints:
1 https://www.1800wheelchair.com/product/22-bariatric-aluminum-transport-chair/
2 https://www.1800wheelchair.com/product/lightweight-bariatric-transport-chair-63523/
3 https://www.1800wheelchair.com/product/medline-bariatric-transport-chair-with-12-rear-wheels/
4 https://www.1800wheelchair.com/product/karman-t-900-extra-wide-transport-wheelchair/
5 https://www.1800wheelchair.com/product/excel-freedom-plus-bariatric-transport-chair/
6 https://www.1800wheelchair.com/product/karman-removable-arm-transport-chair/
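If you also need the specification tables, a minimal sketch (my own combination of this header fix with the pd.read_html idea from the first answer; it assumes the table still carries the specifications class) might look like this:
import requests
import pandas as pd
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'}
url = 'https://www.1800wheelchair.com/category/369/transport-wheelchairs/?p=3'
soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')

for a in soup.select('a[itemprop="url"].button'):
    product_url = a['href']
    product_soup = BeautifulSoup(requests.get(product_url, headers=headers).content, 'html.parser')
    # assumes the spec table uses class "specifications" (no trailing space)
    spec_table = product_soup.find('table', {'class': 'specifications'})
    if spec_table:
        specs = pd.read_html(str(spec_table))[0]  # read_html returns a list; take the first table
        print(product_url)
        print(specs)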
I'm trying to scrape the table at https://bgp.he.net/report/world. I would like to follow each of the links to the country pages, grab the data there, and then move on to the next one. I'm using Beautiful Soup and can already grab the data I want, but I can't quite figure out how to iterate through the column of links.
from bs4 import BeautifulSoup
import requests
import json
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}
url = "https://bgp.he.net/country/LC"
html = requests.get(url, headers=headers)
country_ID = (url[-2:])
print("\n")
soup = BeautifulSoup(html.text, 'html.parser')
#print(soup)
data = []
for row in soup.find_all("tr")[1:]: # start from second row
cells = row.find_all('td')
data.append({
'ASN': cells[0].text,
'Country': country_ID,
"Name": cells[1].text,
"Routes V4": cells[3].text,
"Routes V6": cells[5].text
})
i = 0
with open ('table_attempt.txt', 'w') as r:
for item in data:
r.write(str(data[i]))
i += 1
r.write("\n")
print(data)
I would like to be able to gather the data from each country into one written text file.
I only tested this with the first 3 links (I got a UnicodeEncodeError, but fixed it and commented where that was in the code).
from bs4 import BeautifulSoup
import requests
import json
#First get the list of countries urls
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}
url = "https://bgp.he.net/report/world"
html = requests.get(url, headers=headers)
soup = BeautifulSoup(html.text, 'html.parser')
table = soup.find('table', {'id':'table_countries'})
rows = table.find_all('tr')
country_urls = []
# Go through each row and grab the link. If there's no link, continue to next row
for row in rows:
try:
link = row.select('a')[0]['href']
country_urls.append(link)
except:
continue
# Now iterate through that list
for link in country_urls:
url = "https://bgp.he.net" + link
html = requests.get(url, headers=headers)
country_ID = (url[-2:])
print("\n")
soup = BeautifulSoup(html.text, 'html.parser')
#print(soup)
data = []
for row in soup.find_all("tr")[1:]: # start from second row
cells = row.find_all('td')
data.append({
'ASN': cells[0].text,
'Country': country_ID,
"Name": cells[1].text,
"Routes V4": cells[3].text,
"Routes V6": cells[5].text
})
print('Writing from %s' % url)
# I added encoding="utf-8" because of a UnicodeEncodeError;
# open in append mode ('a') so every country's rows end up in the same file
with open('table_attempt.txt', 'a', encoding="utf-8") as r:
for item in data:
r.write(str(item))
r.write("\n")
You can iterate over the main table, and send a request to scrape the "report" listing:
import requests, re
from bs4 import BeautifulSoup as soup
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}
def scrape_report(_id):
_d = soup(requests.get(f'https://bgp.he.net/country/{_id}', headers=headers).text, 'html.parser')
_headers = [i.text for i in _d.find_all('th')]
_, *data = [[i.text for i in b.find_all('td')] for b in _d.find_all('tr')]
return [dict(zip(_headers, i)) for i in data]
d = soup(requests.get('https://bgp.he.net/report/world', headers=headers).text, 'html.parser')
_, *_listings = [[re.sub('[\t\n]+', '', i.text) for i in b.find_all('td')] for b in d.find_all('tr')]
final_result = [{**dict(zip(['Name', 'Country', 'ASN'], [a, b, c])), 'data':scrape_report(b)} for a, b, c, *_ in _listings]
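To persist final_result afterwards, a short follow-up (my own addition; the filename is arbitrary) could dump it to a JSON file:
import json

with open('bgp_world.json', 'w', encoding='utf-8') as f:
    json.dump(final_result, f, ensure_ascii=False, indent=2)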
import requests
import json
from bs4 import BeautifulSoup
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}
url = "https://bgp.he.net/report/world"
html = requests.get(url, headers=headers)
soup = BeautifulSoup(html.text, 'html.parser')
#sorting through table
table = soup.find('table', {'id':'table_countries'})
rows = table.find_all('tr')
country_urls = []
#Grabbing urls from table
for row in rows:
try:
link = row.select('a')[0]['href']
country_urls.append(link)
except:
continue
Total_URLs= len(country_urls)
print(Total_URLs, "countries to pull data from")
print("\n")
#Creating/emptying the output file before appending to it in the loop below
open('ASN_Info.txt', 'w', encoding="utf-8").close()
#Looping through country url list
for link in country_urls:
url = "https://bgp.he.net" + link
html = requests.get(url, headers=headers)
#Taking country identifier from url list
country_ID = (url[-2:])
soup = BeautifulSoup(html.text, 'html.parser')
data = []
i=0
Total_URLs -= 1
#appending to file
with open('ASN_Info.txt', 'a', encoding="utf-8") as r:
for row in soup.find_all("tr")[1:]: # start from second row
cells = row.find_all('td')
data.append({
'ASN': cells[0].text,
'Country': country_ID,
"Name": cells[1].text,
"Routes V4": cells[3].text,
"Routes V6": cells[5].text
})
json.dump(data[i], r)
i += 1
r.write("\n")
print('Currently writing from data from %s. %s countries left to pull data from.' %(country_ID, Total_URLs))
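As an optional alternative (my suggestion, not part of the original code), csv.DictWriter could write the same per-country rows into a single CSV instead of JSON lines:
import csv

fieldnames = ["ASN", "Country", "Name", "Routes V4", "Routes V6"]

# before the country loop: create the file and write the header once
with open('ASN_Info.csv', 'w', newline='', encoding='utf-8') as f:
    csv.DictWriter(f, fieldnames=fieldnames).writeheader()

# inside the country loop, after building `data`: append that country's rows
with open('ASN_Info.csv', 'a', newline='', encoding='utf-8') as f:
    csv.DictWriter(f, fieldnames=fieldnames).writerows(data)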