formatting data to csv file - python

I wrote this page scraper using Python and Beautiful Soup to extract data from a table, and now I want to save it. The area I scraped is the infobox table on the right-hand side of the Wikipedia page. I need the bold part on the left side to correspond to the right side, so that Key people corresponds to the CEO, for example. I'm new to this and need some advice on the best way to format this. Thank you.
import requests
import csv
from datetime import datetime
from bs4 import BeautifulSoup

# download the page
myurl = requests.get("https://en.wikipedia.org/wiki/Goodyear_Tire_and_Rubber_Company")
# create BeautifulSoup object
soup = BeautifulSoup(myurl.text, 'html.parser')
# pull the class containing the tire name
name = soup.find(class_='logo')
# pull the div in that class
nameinfo = name.find('div')
# just grab the text inside the div
nametext = nameinfo.text
# print information about the goodyear logo on the wiki page
#print(nameinfo)

# now, print the type of company, private or public
#status = soup.find(class_ = 'category')
#for link in soup.select('td.category a'):
#    print link.text

# now get the ceo information
#for employee in soup.select('td.agent a'):
#    print employee.text

# print area served
#area = soup.find(class_ = 'infobox vcard')
#print(area)

# grab the information in bold on the left hand side
vcard = soup.find(class_='infobox vcard')
rows = vcard.find_all('tr')
for row in rows:
    cols = row.find_all('th')
    cols = [x.text.strip() for x in cols]
    print cols

# grab the information on the right hand side
vcard = soup.find(class_='infobox vcard')
rows = vcard.find_all('tr')
for row in rows:
    cols2 = row.find_all('td')
    cols2 = [x.text.strip() for x in cols2]
    print cols2

# save to a csv file named index
with open('index.csv', 'w') as csv_file:
    writer = csv.writer(csv_file)  # actually write to the file
    writer.writerow([cols, cols2, datetime.now()])  # append the time

You need to reorder your code a bit. It is also possible to find both th and td at the same time, which solves your problem of keeping the two columns in sync:
import requests
import csv
from datetime import datetime
from bs4 import BeautifulSoup

myurl = requests.get("https://en.wikipedia.org/wiki/Goodyear_Tire_and_Rubber_Company")
soup = BeautifulSoup(myurl.text, 'html.parser')
vcard = soup.find(class_='infobox vcard')

with open('output.csv', 'wb') as f_output:
    csv_output = csv.writer(f_output)

    for row in vcard.find_all('tr')[1:]:
        cols = row.find_all(['th', 'td'])
        csv_output.writerow([x.text.strip().replace('\n', ' ').encode('ascii', 'ignore') for x in cols] + [datetime.now()])
This would create an output.csv file such as:
Type,Public,2018-03-27 17:12:45.146000
Tradedas,NASDAQ:GT S&P 500 Component,2018-03-27 17:12:45.147000
Industry,Manufacturing,2018-03-27 17:12:45.147000
Founded,"August29, 1898; 119 years ago(1898-08-29) Akron, Ohio, U.S.",2018-03-27 17:12:45.147000
Founder,Frank Seiberling,2018-03-27 17:12:45.147000
Headquarters,"Akron, Ohio, U.S.",2018-03-27 17:12:45.148000
Area served,Worldwide,2018-03-27 17:12:45.148000
Key people,"Richard J. Kramer (Chairman, President and CEO)",2018-03-27 17:12:45.148000
Products,Tires,2018-03-27 17:12:45.148000
Revenue,US$ 15.158 billion[1](2016),2018-03-27 17:12:45.149000
Operating income,US$ 1.52 billion[1](2016),2018-03-27 17:12:45.149000
Net income,US$ 1.264 billion[1](2016),2018-03-27 17:12:45.149000
Total assets,US$ 16.511 billion[1](2016),2018-03-27 17:12:45.150000
Total equity,US$ 4.507 billion[1](2016),2018-03-27 17:12:45.150000
Number of employees,"66,000[1](2017)",2018-03-27 17:12:45.150000
Subsidiaries,List of subsidiaries,2018-03-27 17:12:45.151000
Website,goodyear.com,2018-03-27 17:12:45.151000
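The open('output.csv', 'wb') plus .encode('ascii', 'ignore') combination is Python 2 style. On Python 3, a minimal adaptation (assuming the same infobox structure) would open the file in text mode and skip the byte encoding:

import requests
import csv
from datetime import datetime
from bs4 import BeautifulSoup

myurl = requests.get("https://en.wikipedia.org/wiki/Goodyear_Tire_and_Rubber_Company")
soup = BeautifulSoup(myurl.text, 'html.parser')
vcard = soup.find(class_='infobox vcard')

# on Python 3 the csv module wants a text-mode file opened with newline=''
with open('output.csv', 'w', newline='', encoding='utf-8') as f_output:
    csv_output = csv.writer(f_output)
    for row in vcard.find_all('tr')[1:]:
        cols = row.find_all(['th', 'td'])
        csv_output.writerow([x.text.strip().replace('\n', ' ') for x in cols] + [datetime.now()])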

Related

I am trying to click on the expand button and then scrape the table

I am scraping a table from https://csr.gov.in/companyprofile.php?year=FY+2015-16&CIN=L00000CH1990PLC010573 but I am not getting the exact result I am looking for. I want 11 columns from this link: "company name", "Class", "State", "Company Type", "RoC", "Sub Category", "Listing Status". Those are 7 columns, and after that you can see an expand button "CSR Details of FY 2017-18"; when you click on that button you get 4 more columns: "Average Net Profit", "CSR Prescribed Expenditure", "CSR Spent", "Local Area Spent". I want all these columns in a csv file. I wrote some code but it is not working properly. I am attaching an image of the result for reference, and here is my code. Please help me get this data.
from selenium import webdriver
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import requests
import csv

driver = webdriver.Chrome()

url_file = "csrdata.txt"
with open(url_file, "r") as url:
    url_pages = url.read()

# we need to split the urls into a list to make them iterable
pages = url_pages.split("\n")  # split by lines using \n

data = []
# now we run a for loop to visit the urls one by one
for single_page in pages:
    driver.get(single_page)
    r = requests.get(single_page)
    soup = BeautifulSoup(r.content, 'html5lib')
    driver.find_element_by_link_text("CSR Details of FY 2017-18").click()
    table = driver.find_elements_by_xpath("//*[contains(#id,'colfy4')]")
    about = table.__getitem__(0).text
    x = about.split('\n')
    print(x)
    data.append(x)

df = pd.DataFrame(data)
print(df)

# write to csv
df.to_csv('csr.csv')
You don't need to use selenium, since all the information is inside the html code. Also, you can use pandas' built-in function pd.read_html() to transform the html table directly into a dataframe.
data = []
for single_page in pages:
    r = requests.get(single_page)
    soup = BeautifulSoup(r.content, 'html5lib')
    table = soup.find_all('table')           # finds all tables
    table_top = pd.read_html(str(table))[0]  # the top table
    try:                                     # try to get the other table if it exists
        table_extra = pd.read_html(str(table))[7]
    except:
        table_extra = pd.DataFrame()
    result = pd.concat([table_top, table_extra])
    data.append(result)

pd.concat(data).to_csv('test.csv')
output:
0 1
0 Class Public
1 State Chandigarh
2 Company Type Other than Govt.
3 RoC RoC-Chandigarh
4 Sub Category Company limited by shares
5 Listing Status Listed
0 Average Net Profit 0
1 CSR Prescribed Expenditure 0
2 CSR Spent 0
3 Local Area Spent 0
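The output above stacks the label/value pairs vertically. If you would rather have one wide row per page with the labels as column headers, a small pivot works (a sketch reusing pages, requests and pd from the snippets above, and assuming the tables keep the two-column label/value layout shown in the output):

data = []
for single_page in pages:
    r = requests.get(single_page)
    tables = pd.read_html(r.text)  # parse every table on the page
    pairs = pd.concat([tables[0], tables[7]]) if len(tables) > 7 else tables[0]
    # column 0 holds the labels, column 1 the values: transpose into one wide row
    data.append(pairs.set_index(0).T)

pd.concat(data, ignore_index=True).to_csv('csr_wide.csv', index=False)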

BeautifulSoup joining two tables (rows) to generate the csv file

from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

html = urlopen(
    "https://www.accessdata.fda.gov/scripts/drugshortages/default.cfm")
bsObj = BeautifulSoup(html, "lxml")
table = bsObj.find('table', id="cont")
rows = table.findAll("tr")

links = [a['href'] for a in table.find_all('a', href=True) if a.text]
new_links = []
for link in links:
    new_links.append(("https://www.accessdata.fda.gov/scripts/drugshortages/" + link).replace(" ", "%20"))

href_rows = []
for link in new_links:
    link = link.replace("®", "%C2%AE")
    html = urlopen(link)
    bsObj_href = BeautifulSoup(html, "lxml")
    #bsObj_href = BeautifulSoup(html.decode('utf-8', 'ignore'))
    div_href = bsObj_href.find("div", {"id": "accordion"})
    href_rows.append(div_href.findAll("tr"))

csvFile = open("drug_shortage.csv", 'wt', newline='')
writer = csv.writer(csvFile)
try:
    for row in rows:
        csvRow = []
        for cell in row.findAll(['td', 'th']):
            csvRow.append(cell.get_text())
        writer.writerow(csvRow)
finally:
    csvFile.close()
Hello, so I created the two sets of rows like that. If you go to this website https://www.accessdata.fda.gov/scripts/drugshortages/default.cfm
they have the drug name and status columns, and when you click the drug name you can find four more columns. I would like to combine them (based on drug name) in order,
so it would be drug name, status, Presentation, Availability, Estimated Shortage Duration, Related Information, and Shortage Reason (per FDASIA).
But my current code only generates the first part (drug name, status).
I tried
for row in rows, rows_href:
but then I get AttributeError: ResultSet object has no attribute 'findAll'. I get the same error for
for row in rows_href:
Any suggestion on how to generate what I want?
Your code is too chaotic.
You get all the rows, then all the links, and then you try to get all the other information, but this way you can't control which values get joined into which row. The biggest problem comes when a row has no data on its subpage: everything after it shifts one row up.
You should get all the rows from the table on the main page and then use a for-loop to work with every row separately, getting the other elements only for that single row: read the link only for this row, get the data from the subpage only for this row, etc., and put all the data for this row into a list as a sublist [name, status, link, presentation, availability, related, reason]. Only after that do you move on to the next row.
BTW: because a subpage may have many rows, I create many rows in data with the same name and status but with different other values:
[name, status, values from first row on subpage]
[name, status, values from second row on subpage]
[name, status, values from third row on subpage]
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

html = urlopen("https://www.accessdata.fda.gov/scripts/drugshortages/default.cfm")
bsObj = BeautifulSoup(html, "lxml")

# list for all rows with all values
data = []

# get the table on the main page
table = bsObj.find('table', {'id': 'cont'})

# work with every row separately
for row in table.find_all("tr")[1:]:  # use `[1:]` to skip the header
    # get the columns only in this row
    cols = row.find_all('td')

    # get the name and url from the first column
    link = cols[0].find('a', href=True)
    name = link.text.strip()
    url = link['href']
    url = "https://www.accessdata.fda.gov/scripts/drugshortages/" + url
    url = url.replace(" ", "%20").replace("®", "%C2%AE")
    print('name:', name)
    print('url:', url)

    # get the status from the second column
    status = cols[1].text.strip()
    print('status:', status)

    # subpage
    html = urlopen(url)
    bsObj_href = BeautifulSoup(html, "lxml")
    subtable = bsObj_href.find("table")
    if not subtable:
        data.append([name, status, url, '', '', '', ''])
        print('---')
    else:
        for subrows in subtable.find_all('tr')[1:]:  # use `[1:]` to skip the header
            #print(subrows)
            subcols = subrows.find_all('td')
            presentation = subcols[0].text.strip()
            availability = subcols[1].text.strip()
            related = subcols[2].text.strip()
            reason = subcols[3].text.strip()
            data.append([name, status, url, presentation, availability, related, reason])
            print(presentation, availability, related, reason)
            print('---')

    print('----------')

with open("drug_shortage.csv", 'wt', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # write the header - one row - using `writerow` without `s` at the end
    #writer.writerow(['Name', 'Status', 'Link', 'Presentation', 'Availability', 'Related', 'Reason'])
    # write the data - many rows - using `writerows` with `s` at the end
    writer.writerows(data)
# no need to close the file because it uses `with`
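If you prefer pandas for the final write, the same data list can be saved with a header in one call (the column names here are my own labels, not from the original answer):

import pandas as pd

columns = ['Name', 'Status', 'Link', 'Presentation',
           'Availability', 'Related Information', 'Shortage Reason']
pd.DataFrame(data, columns=columns).to_csv('drug_shortage.csv', index=False)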

bs4 python extracting value from <span></span> to .csv printing the same result over and over

I have managed to build a very primitive program to scrape vehicle data from pistonheads and print it to a .csv file with the link, make and model, and I am working on getting the price, which is where I am encountering a problem.
I want to scrape the prices into the fourth column of my .csv file (Price) and to correctly print the price of each vehicle on the website.
At the moment it only prints the price of one vehicle and repeats it again and again next to each vehicle in the .csv file.
I have tried soup.findAll and soup.find_all to see whether parsing through multiple elements would work, but this just creates a bigger mess.
Might someone be able to help?
I am also trying to scrape the image src and would like to print that in another column (5) called Images.
import csv ; import requests
from bs4 import BeautifulSoup

outfile = open('pistonheads.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Link", "Make", "Model", "Price"])

url = 'https://www.pistonheads.com/classifieds?Category=used-cars&Page=1&ResultsPerPage=100'
get_url = requests.get(url)
get_text = get_url.text
soup = BeautifulSoup(get_text, 'html.parser')

car_link = soup.find_all('div', 'listing-headline', 'price')
for div in car_link:
    links = div.findAll('a')
    for a in links:
        link = ("https://www.pistonheads.com" + a['href'])
        make = (a['href'].split('/')[-4])
        model = (a['href'].split('/')[-3])
        price = soup.find('span')
        writer.writerow([link, make, model, price])
        print(link, make, model, price)

outfile.close()
You can try this:
import csv, requests, re
from urllib.parse import urlparse
from bs4 import BeautifulSoup as soup

d = soup(requests.get('https://www.pistonheads.com/classifieds?Category=used-cars&ResultsPerPage=100').text, 'html.parser')

def extract_details(_s:soup) -> list:
    _link = _s.find('a', {'href':re.compile('/classifieds/used\-cars/')})['href']
    _, _, make, model, *_ = _link[1:].split('/')
    price, img = _s.find('div', {'class':'price'}).text, [i['src'] for i in _s.find_all('img')]
    return [_link, make, model, price, 'N/A' if not img else img[0]]

with open('filename.csv', 'w') as f:
    _listings = [extract_details(i) for i in d.find_all('div', {'class':'ad-listing'}) if i.find('div', {'class':'price'})]
    write = csv.writer(f)
    write.writerows([['link', 'make', 'model', 'price', 'img'], *_listings])
The reason is price = soup.find('span').
.find() grabs the first element it finds, and you have it looking in your whole soup object. Where you actually want it to look is within your a, because that's what you are looping through with for a in links:.
I also added .text since I assume you just want the text, not the whole tag element, i.e. price = a.find('span').text
import csv ; import requests
from bs4 import BeautifulSoup

outfile = open('pistonheads.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Link", "Make", "Model", "Price", "Images"])

url = 'https://www.pistonheads.com/classifieds?Category=used-cars&Page=1&ResultsPerPage=100'
get_url = requests.get(url)
get_text = get_url.text
soup = BeautifulSoup(get_text, 'html.parser')

car_link = soup.find_all('div', 'listing-headline', 'price')
for div in car_link:
    links = div.findAll('a')
    for a in links:
        link = ("https://www.pistonheads.com" + a['href'])
        make = (a['href'].split('/')[-4])
        model = (a['href'].split('/')[-3])
        price = a.find('span').text
        image_link = a.parent.parent.find('img')['src']
        image = link + image_link
        writer.writerow([link, make, model, price, image])
        print(link, make, model, price, image)

outfile.close()
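One caveat about image = link + image_link above: if the src attribute turns out to be relative to the site rather than to the ad page, urllib.parse.urljoin is a safer way to build an absolute image URL than plain string concatenation (a small sketch):

from urllib.parse import urljoin

# resolve a possibly relative src against the site root instead of gluing strings together
image = urljoin("https://www.pistonheads.com", image_link)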

how to loop using beautifulsoup

I am trying to scrape data on car model, price, mileage, location, etc. using BeautifulSoup. However, the returned result only reports data for one random car. I want to be able to collect data on all cars advertised on the site to date. My Python code is below. How can I modify it so that each day I have information on car model, price, mileage, location, etc.? Example:
Car model price mileage location date
Toyota Corrola $4500 22km Accra 16/02/2018
Nissan Almera $9500 60km Tema 16/02/2018
etc
import requests
from bs4 import BeautifulSoup
import pandas
import csv
from datetime import datetime

for i in range(300):
    url = "https://tonaton.com/en/ads/ghana/cars?".format(i)

r = requests.get(url)
soup = BeautifulSoup(r.content, "html.parser")
print soup.prettify()

data = soup.find(class_='item-content')
for tag in data:
    item_title = data.find("a", attrs={"class": "item-title h4"})
    model = item_title.text.encode('utf-8').strip()

    item_meta = data.find("p", attrs={"class": "item-meta"})
    mileage = item_meta.text.encode('utf-8').strip()

    item_location = data.find("p", attrs={"class": "item-location"})
    location = item_location.text.encode('utf-8').strip()

    item_info = data.find("p", attrs={"class": "item-info"})
    price = item_info.text.encode('utf-8').strip()

    with open('example.csv', 'a') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow([model, price, mileage, location, datetime.now()])
First off, this loop:
for i in range(300):
    url = "https://tonaton.com/en/ads/ghana/cars?".format(i)
is not doing what I assume you think it is. This loop simply resets the url 300 times and leaves you with the original url you set. You need to wrap all your code in this loop to ensure you are hitting each of the URLs you want (1-300).
Restructure your code (paying attention to indents!) so that the next url is the one being used in the request:
# This will print A LOT of titles
for i in range(300):
    url = "https://tonaton.com/en/ads/ghana/cars?" + str(i)
    print(url)  # Notice how the url changes with each iteration?
    r = requests.get(url)
    soup = bsoup(r.content, "html.parser")
    titles = soup.findAll("a", attrs={"class": "item-title h4"})
    for item in titles:
        currTitle = item.text.encode('utf-8').strip()
        print(currTitle)
This code:
import requests
from bs4 import BeautifulSoup as bsoup

url = "https://tonaton.com/en/ads/ghana/cars?1"
r = requests.get(url)
soup = bsoup(r.content, "html.parser")
titles = soup.findAll("a", attrs={"class": "item-title h4"})
for item in titles:
    print(item.text.encode('utf-8').strip())
Yields (the leading b just marks a bytes object, which comes from the .encode('utf-8') call):
b'Hyundai Veloster 2013'
b'Ford Edge 2009'
b'Mercedes-Benz C300 2016'
b'Mazda Demio 2007'
b'Hyundai Santa fe 2005'
# And so on...
The problem is that 1) if you call find(), it stops after the first match given your params; using findAll() instead dumps all matches into a list which you can then iterate through and process as needed. And 2) the result you get from a call to find() is only a fragment of the original HTML, so the subsequent find() calls won't work the way you expect.
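A quick toy example (not from the site) shows the difference between the two calls:

from bs4 import BeautifulSoup

snippet = BeautifulSoup('<p class="x">one</p><p class="x">two</p>', 'html.parser')
print(snippet.find('p', class_='x').text)                    # 'one'  - only the first match
print([p.text for p in snippet.find_all('p', class_='x')])   # ['one', 'two'] - every match

With that distinction in mind, here is a reworked version of the script: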
import requests
from bs4 import BeautifulSoup as bsoup
import csv
from datetime import datetime

for i in range(300):
    url = "https://tonaton.com/en/ads/ghana/cars?".format(i)
    r = requests.get(url)
    soup = bsoup(r.content, "html.parser")

    item_title = soup.findAll("a", attrs={"class": "item-title h4"})
    for item in item_title:
        model = item.text.encode('utf-8').strip()

    item_meta = soup.findAll("p", attrs={"class": "item-meta"})
    for item in item_meta:
        milleage = item.text.encode('utf-8').strip()

    item_location = soup.findAll("p", attrs={"class": "item-location"})
    for item in item_location:
        location = item.text.encode('utf-8').strip()

    item_info = soup.findAll("p", attrs={"class": "item-info"})
    for item in item_info:
        price = item.text.encode('utf-8').strip()

    with open('index.csv', 'w') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow([model, price, milleage, location, datetime.now()])
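Note that this version still collects titles, prices, mileages and locations in separate passes, so the values can drift out of step. A per-listing sketch keeps them paired (assuming each advert sits in its own element with class item-content, as the question's data = soup.find(class_='item-content') suggests; the cars.csv filename is my own):

import csv
import requests
from bs4 import BeautifulSoup
from datetime import datetime

with open('cars.csv', 'w', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for i in range(300):
        r = requests.get("https://tonaton.com/en/ads/ghana/cars?" + str(i))
        soup = BeautifulSoup(r.content, "html.parser")
        # one container per advert keeps model/price/mileage/location together
        for ad in soup.find_all(class_='item-content'):
            model = ad.find("a", attrs={"class": "item-title h4"})
            info = ad.find("p", attrs={"class": "item-info"})
            meta = ad.find("p", attrs={"class": "item-meta"})
            loc = ad.find("p", attrs={"class": "item-location"})
            row = [x.text.strip() if x else '' for x in (model, info, meta, loc)]
            writer.writerow(row + [datetime.now()])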

Trying to parse table data into a csv file. Is there a way to parse dynamically generated table data into a csv row with BeautifulSoup in Python?

I have a list of names, and I am trying to parse the whole table content into one csv row with XPath. For some names, if there is less content, my webdriver crashes and the program stops, so I decided to parse the table with pandas instead. I did my research on parsing a table with pandas into a csv file, but I don't know how to implement it.
Here is the data from the table I am trying to parse into one csv row:
DLLC , ACT , OREGON , 11-25-2015 , 11-25-2017 , PPB , PRINCIPAL PLACE OF BUSINESS , 22325 SW MURPHY ST,BEAVERTON , OR and so on.
Every data field from that table should end up like this in Excel, one value per cell. I don't want any header, just the table data in a row.
Now I have a list of names in a csv, something like this:
HALF MOON BEND FARM, LLC
NICELY GROWN LLC
COPR INCORPORATED
so on......
Here is the code:
from selenium import webdriver
from bs4 import BeautifulSoup
import lxml
import time
import csv

driver = webdriver.Chrome()
driver.get("url")
#time.sleep(5)
username = driver.find_element_by_name("p_name")
#time.sleep(1)
username.send_keys("xxxxxxx")
#username.clear()
driver.find_element_by_xpath("html/body/form/table[6]/tbody/tr/td[2]/input").click()

entity = driver.find_element_by_partial_link_text("xxxxxxx")
entity.click()

html = driver.page_source

Registry_nbr = driver.find_element_by_xpath("html/body/form/table[2]/tbody/tr[2]/td[1]").text
Entity_type = driver.find_element_by_xpath("html/body/form/table[2]/tbody/tr[2]/td[2]").text
Entity_status = driver.find_element_by_xpath("html/body/form/table[2]/tbody/tr[2]/td[3]").text
Registry_date = driver.find_element_by_xpath("html/body/form/table[2]/tbody/tr[2]/td[6]").text
#Next_renewal_date = driver.find_element_by_xpath("html/body/form/table[2]/tbody/tr[2]/td[6]").text
entity_name = driver.find_element_by_xpath("html/body/form/table[3]/tbody/tr/td[2]").text
Ttest = driver.find_element_by_xpath("html/body/form/table[32]/tbody/tr/td[2]").text

with open("sos.csv", "w") as scoreFile:
    scoreFileWriter = csv.writer(scoreFile)
    scoreFileWriter.writerow([Registry_nbr, Entity_type, Entity_status, Registry_date, entity_name])
scoreFile.close()

soup = BeautifulSoup(html)
for tag in soup.find_all('table'):
    print tag.text
Use this after entity.click()
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')
words = soup.find_all("td")
table_data = soup.get_text().encode('utf-8')

word = list()
for cell in words:
    word.append((cell.text).encode('utf-8'))

with open('name.csv', 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',')
    spamwriter.writerow(word)
hope this will help
Once you have the html you can parse it using BeautifulSoup and find the table you want. Looking at the HTML page you reference, I do not see any class ids or other identifying keys to search for, so just indexing into table[2] will have to do.
html = driver.page_source
soup = BeautifulSoup(html, 'html.parser')

NBSP = u'\xa0'
tables = [ [ map(lambda d: d.text.replace(NBSP, u''), r.findAll('td'))
             for r in t.findAll('tr') ]
           for t in soup.findAll('table') ]

business_entity_data = tables[2]
keys = business_entity_data[0]

with open('page.csv', 'wb') as csvfile:
    csvwriter = csv.DictWriter(csvfile, keys)
    csvwriter.writeheader()
    csvwriter.writerow(dict(zip(keys, business_entity_data[1])))
You should end up with a file containing:
Registry Nbr,Entity Type,Entity Status,Jurisdiction,Registry Date,Next Renewal Date,Renewal Due?
1164570-94,DLLC,ACT,OREGON,11-25-2015,11-25-2017,
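Since the original goal was one header-less row per company from a list of names, the outer loop might look like this (a sketch only: names.csv and the empty values placeholder are my assumptions, and the selenium search steps from the question would go inside the loop):

import csv

# read the company names, assuming one name per line in names.csv; whole lines are kept
# because names like "HALF MOON BEND FARM, LLC" contain commas
with open('names.csv') as f:
    names = [line.strip() for line in f if line.strip()]

with open('page.csv', 'a', newline='') as out:  # append mode: one header-less row per company
    writer = csv.writer(out)
    for name in names:
        # drive the search form with selenium and extract the cell values here,
        # e.g. values = business_entity_data[1] from the answer above
        values = []  # placeholder for the extracted cells
        writer.writerow(values)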
