Using Python, BeautifulSoup, CSV to scrape a URL

From this URL https://doc8643.com/aircrafts I want to scrape all rows.
Then, for each individual row, for example https://doc8643.com/aircraft/A139,
I want to scrape these three areas of data:
<table class="table centered-table">
<h4>Manufacturers</h4>
<h4>Technical Data</h4>
Can this be done in Python?
import requests, csv
from bs4 import BeautifulSoup
from urllib.request import Request

url = 'https://doc8643.com/aircrafts'
req = Request(url , headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'})

with open('doc8643.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    while True:
        print(url)
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'html.parser')

        # Go through table = tbody and extract the data under the 'td' tag
        for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
            writer.writerow([c.text if c.text else '' for c in row.select('h3')])
            print(row)

        # If more than one page then iterate through all of them
        if soup.select_one('ul.pagination li.active + li a'):
            url = soup.select_one('ul.pagination li.active + li a')['href']
        else:
            break

You should create a function which takes the value c.text (i.e. A139), builds the full URL like https://doc8643.com/aircraft/A139, and uses requests and BeautifulSoup to get all the needed data.
def scrape_details(number):
    url = 'https://doc8643.com/aircraft/' + number
    print('details:', url)

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    # ... scrape details and put in list `results` ...

    return results
and run it in your loop
for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
    data = [c.text if c.text else '' for c in row.select('h3')]
    for item in data:
        values = scrape_details(item)
        writer.writerow([item] + values)
The biggest problem is scraping the details.
For some details you need to scrape each dl, then all dt and dd inside it, and use zip() to group them in pairs.
Something like
def scrape_details(number):
    url = 'https://doc8643.com/aircraft/' + number
    print('details:', url)

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    results = []

    all_dl = soup.find_all('dl')
    for item in all_dl:
        all_dt = item.find_all('dt')
        all_dd = item.find_all('dd')
        for dt, dd in zip(all_dt, all_dd):
            pair = f"{dt.string}: {dd.string}"
            results.append(pair)
            print(pair)

    #print(results)

    return results
but this needs more code, so I skip this part.
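If you do want the Manufacturers and Technical Data sections from the question, here is a rough sketch of the idea (assuming each <h4> heading on the aircraft page is followed by the table holding its data, which I have not verified against the site):

def scrape_sections(soup):
    # Sketch only: walks every <h4> (e.g. "Manufacturers", "Technical Data")
    # and grabs the text of the next <table> after it -- adjust to the real markup.
    results = []
    for heading in soup.find_all('h4'):
        section = heading.find_next('table')
        if section:
            results.append(f"{heading.get_text(strip=True)}: {section.get_text(' ', strip=True)}")
    return results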
Minimal working code
EDIT: I added url = 'https://doc8643.com' + url
import csv
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'}

# --- functions ---

def scrape_details(number):
    url = 'https://doc8643.com/aircraft/' + number
    print('details:', url)

    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    results = []

    all_dl = soup.find_all('dl')
    for item in all_dl:
        all_dt = item.find_all('dt')
        all_dd = item.find_all('dd')
        for dt, dd in zip(all_dt, all_dd):
            pair = f"{dt.string}: {dd.string}"
            results.append(pair)
            print(pair)

    #print(results)

    return results

# --- main ---

url = 'https://doc8643.com/aircrafts'

with open('doc8643.csv', "w", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["data1", "data2", "data3", "etc..."])

    while True:
        print('url:', url)

        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Go through table = tbody and extract the data under the 'td' tag
        for row in soup.select('ul.nav.nav-pills.nav-stacked li.aircraft_item'):
            data = [c.text if c.text else '' for c in row.select('h3')]
            for item in data:
                values = scrape_details(item)
                writer.writerow([item] + values)

        # If more than one page then iterate through all of them
        if soup.select_one('ul.pagination li.active + li a'):
            url = soup.select_one('ul.pagination li.active + li a')['href']
            url = 'https://doc8643.com' + url
        else:
            break
BTW:
Maybe it would be better to keep results as a dictionary
results[dt.string] = [dd.string]
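As a rough, untested sketch of that idea, the details could be collected into a dict and written with csv.DictWriter, so the columns stay aligned even if different aircraft pages expose different fields:

import csv
import requests
from bs4 import BeautifulSoup

def scrape_details_dict(number, headers):
    # Sketch: same dl/dt/dd walk as above, but collected into a dictionary
    url = 'https://doc8643.com/aircraft/' + number
    soup = BeautifulSoup(requests.get(url, headers=headers).text, 'html.parser')
    results = {}
    for dl in soup.find_all('dl'):
        for dt, dd in zip(dl.find_all('dt'), dl.find_all('dd')):
            results[dt.get_text(strip=True)] = dd.get_text(strip=True)
    return results

# Usage sketch: gather all rows first, then let DictWriter build the header
# rows = [{'code': 'A139', **scrape_details_dict('A139', headers)}]
# fieldnames = sorted({key for row in rows for key in row})
# with open('doc8643.csv', 'w', newline='', encoding='utf-8') as f:
#     writer = csv.DictWriter(f, fieldnames=fieldnames)
#     writer.writeheader()
#     writer.writerows(rows)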

Related

Python Beautifulsoup scraping script unpacking, hardcoding and duplication

I'm practising some Python scraping and I'm a bit stuck with the following exercise. The aim is to scrape the tickers that result when applying some filters. Code below:
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup as soup

tickers = []
counter = 1

while True:
    url = ("https://finviz.com/screener.ashx?v=111&f=cap_large&r="+ str(counter))
    req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    webpage = urlopen(req).read()
    html = soup(webpage, "html.parser")
    rows = html.select('table[bgcolor="#d3d3d3"] tr')
    for i in rows[1:]:
        a1, a2, a3, a4 = (x.text for x in i.find_all('td')[1:5])
        i = a1
        tickers.append(i)
    counter+=20
    if tickers[-1]==tickers[-2]:
        break
I'm not sure how to extract only one column, so I'm using the code for all of them (a1, a2, a3, a4 = (x.text for x in i.find_all('td')[1:5])). Is there a way to get just the first column?
Is there a way to avoid having to hardcode '20' in the script?
When I run the code it creates a duplicate of the last ticker. Is there another way to make the code stop once it has gone through all the entries?
Since you are only interested in the values of the ticker column, select it more specifically, based on its content, the <a>:
html.select('table[bgcolor="#d3d3d3"] a.screener-link-primary')
To avoid working with the hardcoded 20, just check whether there is a next-page element and use its href:
html.select_one('.tab-link:-soup-contains("next")')
Example
import requests, time
from bs4 import BeautifulSoup

url = "https://finviz.com/screener.ashx?v=111&f=cap_large"
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36','accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'}

tickers = []

while True:
    r = requests.get(url, headers=headers)
    html = BeautifulSoup(r.text, "html.parser")

    for a in html.select('table[bgcolor="#d3d3d3"] a.screener-link-primary'):
        tickers.append(a.text)

    if html.select_one('.tab-link:-soup-contains("next")'):
        url = "https://finviz.com/"+html.select_one('.tab-link:-soup-contains("next")')['href']
    else:
        break

    # be kind and add some delay between your requests
    time.sleep(1)
tickers
You can use an nth-child range to filter out the first row of the table, then nth-child(2) to get the ticker column within the remaining table rows:
tickers = [td.text for td in html.select('table[bgcolor="#d3d3d3"] tr:nth-child(n+2) td:nth-child(2)')]
With an existing list use
tickers.extend([td.text for td in html.select('table[bgcolor="#d3d3d3"] tr:nth-child(n+2) td:nth-child(2)')])
Read about nth-child here:
http://nthmaster.com/
and
https://developer.mozilla.org/en-US/docs/Web/CSS/:nth-child
You can stop when there is no more "next" present. counter needs to be incremented by 20 on each request.
import requests
from bs4 import BeautifulSoup as bs

tickers = []
counter = 1

with requests.Session() as s:
    s.headers = {'User-Agent':'Mozilla/5.0'}
    while True:
        # print(counter)
        url = ("https://finviz.com/screener.ashx?v=111&f=cap_large&r="+ str(counter))
        res = s.get(url)
        html = bs(res.text, "html.parser")
        tickers.extend([td.text for td in html.select('table[bgcolor="#d3d3d3"] tr:nth-child(n+2) td:nth-child(2)')])
        if html.select_one('.tab-link b:-soup-contains("next")') is None:
            break
        counter+=20

Pagination link is repeated in my BeautifulSoup Python code

from bs4 import BeautifulSoup
import requests
import csv

class Parse():
    def __init__(self):
        self.row_list = []
        self.base_url ='https://www.tripadvisor.co.uk'

    def parse(self,url): # correct
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36 Edg/90.0.818.51'}
        response = requests.get(url,headers).text
        soup = BeautifulSoup(response,'html.parser')
        next_link = soup.find('a',class_='_23XJjgWS _1hF7hP_9 _2QvUxWyA')
        next_page = self.base_url+next_link.attrs['href']
        cards = soup.find_all('section',class_='_2TabEHya _3YhIe-Un')
        for card in cards:
            name = card.find('div',class_='_1gpq3zsA _1zP41Z7X').text
            rating = str(card.find('svg',class_='zWXXYhVR'))
            rating = self.remove(filter_col=rating)
            review_count = card.find('span',class_='DrjyGw-P _26S7gyB4 _14_buatE _1dimhEoy').text
            status = card.find('div',class_='DrjyGw-P _26S7gyB4 _3SccQt-T').text
            row_list = [name,rating,status,review_count]
        return next_page,row_list

    def remove(self,filter_col):
        rating = filter_col.split(' ')[1]
        rating = rating[-3:]
        return rating

    def write_csv(self,row_list):
        with open('top_sites.csv','w') as file:
            csv_writer = csv.writer(file, delimiter=',')
            csv_writer.writerows(row_list)

if __name__=='__main__':
    url = "https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa30-Myanmar.html"
    parsing = Parse()
    next_url,row_list = parsing.parse(url=url)
    print(next_url)
Output:
https://www.tripadvisor.co.uk/Attractions-g294190-Activities-Myanmar.html
I'm trying to scrape data from TripAdvisor Website using BeautifulSoup.
Link: https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa30-Myanmar.html
Instead of going to the next page, the link just repeats itself. Is there a solution to my problem?
I've selected the correct selector for the soup and I was able to scrape data.
To get pagination working, it's necessary to change the -oa<index>- part of the URL:
import csv
import requests
from bs4 import BeautifulSoup

url = "https://www.tripadvisor.co.uk/Attractions-g294190-Activities-oa{}-Myanmar.html"

data = []
for page in range(0, 4):  # <--- increase page count here
    print("Getting page {}..".format(page))

    soup = BeautifulSoup(
        requests.get(url.format(page * 30)).content, "html.parser"
    )

    titles = soup.select('span[name="title"]')
    for title in titles:
        no, t = title.get_text(strip=True, separator="|").split("|")
        rating = title.find_next("svg")
        review_count = rating.find_next("span")
        data.append(
            (
                no,
                t,
                rating["title"],
                review_count.text,
                review_count.find_next(
                    "div", class_="DrjyGw-P _26S7gyB4 _3SccQt-T"
                ).text,
            )
        )

with open("data.csv", "w") as f_out:
    w = csv.writer(f_out)
    w.writerows(data)
Writes data.csv (screenshot from LibreOffice not shown).

How to scrape the data from "https://www.nseindia.com/companies-listing/corporate-filings-event-calendar?days=7days"

I am trying to scrape the table from the "https://www.nseindia.com/companies-listing/corporate-filings-event-calendar?days=7days" website, but the Python output is not returning the table.
import requests
from bs4 import BeautifulSoup
url = 'https://www.nseindia.com/companies-listing/corporate-filings-event-calendar?days=7days'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36'}
response = requests.get(url, headers=headers)
print(response)
soup = BeautifulSoup(response.text, 'lxml')
print(soup)
data_array = soup.find(id='table-wrap my-3 borderSet maxHeight-900 scrollWrap').get_text().strip().split(":")
type(data_array)
the output is printing the HTML tag instead of the table.
If you want the table, there's a download link available on the page; it's available as a CSV file. You don't need any code. Why don't you just use that?
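If you still want to handle it in Python after downloading, a minimal sketch (assuming the file was saved locally as event_calendar.csv, a placeholder name) could load it with pandas:

import pandas as pd

# Placeholder filename -- use whatever name the downloaded CSV was saved under
df = pd.read_csv('event_calendar.csv')
print(df.head())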
This code will return the whole table as a list; table_locator is an XPath locator for the table (it assumes a Selenium page object where self.find_element returns the element):
def get_table(self, table_locator):
    # assumes a Selenium page-object class: self.find_element(table_locator)
    # returns the table WebElement
    data_table = self.find_element(table_locator).get_attribute('innerHTML').replace('<th></th>', '')
    soup = BeautifulSoup(data_table, 'lxml')
    data_rows = soup.find_all('tr')

    rows_values_scrape = [[td.getText() for td in data_rows[i].findAll('td')]
                          for i, v in enumerate(data_rows)]
    rows_values = [x for x in rows_values_scrape if x]

    columns_scrape = [[td.getText() for td in data_rows[i].findAll('th')]
                      for i, v in enumerate(data_rows)]
    columns = [x for x in columns_scrape if x]

    table = []
    if columns[1:] != []:
        for i, r in enumerate(columns[1:]):
            table.append([f'column: {columns[0][j]}, row_title: {columns[1:][i][0]}, cell: {rows_values[i][j]}' for j, c in enumerate(columns[0])])
    else:
        table = [f'column: {columns[0][j]}, cell: {rows_values[0][j]}' for j, c in enumerate(columns[0]) if columns[1:] == []]
    return table

How do I search within a website using the 'requests' module?

I want to search for different company names on the website. Website link: https://www.firmenwissen.de/index.html
On this website, I want to use the search engine to search for companies. Here is the code I am trying to use:
from bs4 import BeautifulSoup as BS
import requests
import re

companylist = ['ABEX Dachdecker Handwerks-GmbH']
url = 'https://www.firmenwissen.de/index.html'
payloads = {
    'searchform': 'UFT-8',
    'phrase':'ABEX Dachdecker Handwerks-GmbH',
    "mainSearchField__button":'submit'
}
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

html = requests.post(url, data=payloads, headers=headers)
soup = BS(html.content, 'html.parser')
link_list= []
links = soup.findAll('a')
for li in links:
    link_list.append(li.get('href'))
print(link_list)
This code should bring me the next page with company information. But unfortunately, it returns only the home page. How can I do this?
Change the initial URL you are searching against. Grab only the appropriate hrefs and add them to a set to ensure there are no duplicates (or alter the selector to return only one match if possible); add those items to a final set so you only loop over the required number of links. I have used Session on the assumption you will repeat this for many companies.
Then iterate over the set using selenium to navigate to each company URL and extract whatever info you need.
This is an outline.
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver

d = webdriver.Chrome()
companyList = ['ABEX Dachdecker Handwerks-GmbH','SUCHMEISTEREI GmbH']
url = 'https://www.firmenwissen.de/ergebnis.html'
baseUrl = 'https://www.firmenwissen.de'
headers = {'User-Agent': 'Mozilla/5.0'}
finalLinks = set()

## searches section; gather into set
with requests.Session() as s:
    for company in companyList:
        payloads = {
            'searchform': 'UFT-8',
            'phrase':company,
            "mainSearchField__button":'submit'
        }
        html = s.post(url, data=payloads, headers=headers)
        soup = BS(html.content, 'lxml')
        companyLinks = {baseUrl + item['href'] for item in soup.select("[href*='firmeneintrag/']")}
        # print(soup.select_one('.fp-result').text)
        finalLinks = finalLinks.union(companyLinks)

for item in finalLinks:
    d.get(item)
    info = d.find_element_by_css_selector('.yp_abstract_narrow')
    address = d.find_element_by_css_selector('.yp_address')
    print(info.text, address.text)

d.quit()
Just the first link for each company:
from bs4 import BeautifulSoup as BS
import requests
from selenium import webdriver

d = webdriver.Chrome()
companyList = ['ABEX Dachdecker Handwerks-GmbH','SUCHMEISTEREI GmbH', 'aktive Stuttgarter']
url = 'https://www.firmenwissen.de/ergebnis.html'
baseUrl = 'https://www.firmenwissen.de'
headers = {'User-Agent': 'Mozilla/5.0'}
finalLinks = []

## searches section; add to list
with requests.Session() as s:
    for company in companyList:
        payloads = {
            'searchform': 'UFT-8',
            'phrase':company,
            "mainSearchField__button":'submit'
        }
        html = s.post(url, data=payloads, headers=headers)
        soup = BS(html.content, 'lxml')
        companyLink = baseUrl + soup.select_one("[href*='firmeneintrag/']")['href']
        finalLinks.append(companyLink)

for item in set(finalLinks):
    d.get(item)
    info = d.find_element_by_css_selector('.yp_abstract_narrow')
    address = d.find_element_by_css_selector('.yp_address')
    print(info.text, address.text)

d.quit()

How do you iterate through HTML links in table to pull data from tables?

I'm trying to scrape the table at https://bgp.he.net/report/world. I would like to go through each of the HTML links leading to country pages, grab the data, and then iterate to the next one. I'm using BeautifulSoup and can already grab the data that I want, but can't quite figure out how to iterate through the column of HTML links.
from bs4 import BeautifulSoup
import requests
import json

headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}

url = "https://bgp.he.net/country/LC"
html = requests.get(url, headers=headers)
country_ID = (url[-2:])
print("\n")
soup = BeautifulSoup(html.text, 'html.parser')
#print(soup)

data = []
for row in soup.find_all("tr")[1:]: # start from second row
    cells = row.find_all('td')
    data.append({
        'ASN': cells[0].text,
        'Country': country_ID,
        "Name": cells[1].text,
        "Routes V4": cells[3].text,
        "Routes V6": cells[5].text
    })

i = 0
with open ('table_attempt.txt', 'w') as r:
    for item in data:
        r.write(str(data[i]))
        i += 1
        r.write("\n")

print(data)
I would like to be able to gather the data from each country into one written text file.
I only tested this with the first 3 links (I got one UnicodeEncodeError, but fixed it and commented where that was in the code).
from bs4 import BeautifulSoup
import requests
import json

#First get the list of countries urls
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}

url = "https://bgp.he.net/report/world"
html = requests.get(url, headers=headers)
soup = BeautifulSoup(html.text, 'html.parser')

table = soup.find('table', {'id':'table_countries'})
rows = table.find_all('tr')

country_urls = []

# Go through each row and grab the link. If there's no link, continue to next row
for row in rows:
    try:
        link = row.select('a')[0]['href']
        country_urls.append(link)
    except:
        continue

# Now iterate through that list
for link in country_urls:
    url = "https://bgp.he.net" + link
    html = requests.get(url, headers=headers)
    country_ID = (url[-2:])
    print("\n")
    soup = BeautifulSoup(html.text, 'html.parser')
    #print(soup)

    data = []
    for row in soup.find_all("tr")[1:]: # start from second row
        cells = row.find_all('td')
        data.append({
            'ASN': cells[0].text,
            'Country': country_ID,
            "Name": cells[1].text,
            "Routes V4": cells[3].text,
            "Routes V6": cells[5].text
        })

    i = 0
    print ('Writing from %s' %(url))

    # I added encoding="utf-8" because of an UnicodeEncodeError:
    with open ('table_attempt.txt', 'w', encoding="utf-8") as r:
        for item in data:
            r.write(str(data[i]))
            i += 1
            r.write("\n")
You can iterate over the main table, and send a request to scrape the "report" listing:
import requests, re
from bs4 import BeautifulSoup as soup

headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}

def scrape_report(_id):
    _d = soup(requests.get(f'https://bgp.he.net/country/{_id}', headers=headers).text, 'html.parser')
    _headers = [i.text for i in _d.find_all('th')]
    _, *data = [[i.text for i in b.find_all('td')] for b in _d.find_all('tr')]
    return [dict(zip(_headers, i)) for i in data]

d = soup(requests.get('https://bgp.he.net/report/world', headers=headers).text, 'html.parser')
_, *_listings = [[re.sub('[\t\n]+', '', i.text) for i in b.find_all('td')] for b in d.find_all('tr')]
final_result = [{**dict(zip(['Name', 'Country', 'ASN'], [a, b, c])), 'data':scrape_report(b)} for a, b, c, *_ in _listings]
import requests
import json
from bs4 import BeautifulSoup

headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:56.0) Gecko/20100101 Firefox/56.0'}

url = "https://bgp.he.net/report/world"
html = requests.get(url, headers=headers)
soup = BeautifulSoup(html.text, 'html.parser')

#sorting through table
table = soup.find('table', {'id':'table_countries'})
rows = table.find_all('tr')

country_urls = []

#Grabbing urls from table
for row in rows:
    try:
        link = row.select('a')[0]['href']
        country_urls.append(link)
    except:
        continue

Total_URLs= len(country_urls)
print(Total_URLs, "countries to pull data from")
print("\n")

#Creating text file
with open('table_attempt.txt', 'w', encoding="utf-8") as r:
    json.dumps([])

#Looping through country url list
for link in country_urls:
    url = "https://bgp.he.net" + link
    html = requests.get(url, headers=headers)

    #Taking country identifier from url list
    country_ID = (url[-2:])
    soup = BeautifulSoup(html.text, 'html.parser')

    data = []
    i=0
    Total_URLs -= 1

    #appending to file
    with open('ASN_Info.txt', 'a', encoding="utf-8") as r:
        for row in soup.find_all("tr")[1:]: # start from second row
            cells = row.find_all('td')
            data.append({
                'ASN': cells[0].text,
                'Country': country_ID,
                "Name": cells[1].text,
                "Routes V4": cells[3].text,
                "Routes V6": cells[5].text
            })
            json.dump(data[i], r)
            i += 1
            r.write("\n")

    print('Currently writing data from %s. %s countries left to pull data from.' %(country_ID, Total_URLs))
