I am web-scraping 2 tables from 2 different sites.
I want to append a new column (called WHEREFROM in the header) containing a piece of scraped text; in my code I called it "name".
My code is here:
from bs4 import BeautifulSoup
from selenium import webdriver
import time
import urllib2
import unicodecsv as csv
import os
import sys
import io
import datetime
import pandas as pd
import re
import contextlib
import selenium.webdriver.support.ui as ui
filename=r'output.csv'
resultcsv=open(filename,"wb")
output=csv.writer(resultcsv, delimiter=';',quotechar = '"', quoting=csv.QUOTE_NONNUMERIC, encoding='latin-1')
output.writerow(['TIME','FLIGHT','FROM','AIRLANE','AIRCRAFT','STATUS','WHEREFROM', 'ACTUALDATE'])
def scrape(urls):
    browser = webdriver.Firefox()
    for url in urls:
        browser.get(url)
        html = browser.page_source
        soup = BeautifulSoup(html, "html.parser")
        table = soup.find('table', {"class": "table table-condensed table-hover data-table m-n-t-15"})
        soup2 = BeautifulSoup(html, "html.parser")
        name = soup2.find('div', attrs={'class': 'row m-t-l m-l-l'})
        datatable = []
        for record in table.find_all('tr', class_="hidden-xs hidden-sm ng-scope"):
            temp_data = []
            for data in record.find_all("td"):
                temp_data.append(data.text.encode('latin-1'))
            newlist = filter(None, temp_data)
            datatable.append(newlist)
        print name
        output.writerows(datatable)
        time.sleep(10)
    resultcsv.close()
    browser.close()
urls = ["https://www.flightradar24.com/data/airports/bud/arrivals", "https://www.flightradar24.com/data/airports/fco/arrivals"]
scrape(urls)
resultcsv.close()
How can I do this inside the loop, and how can I do it correctly? After scraping I write the data to a CSV where the delimiter is ;.
But the last scraped field isn't followed by a ;, so do I have to insert a ; after that last text before appending the new column?!
I am talking about this:
"1:15 PM";" KL1975";"Amsterdam (AMS)-";"KLM";"B737 (PH-BGT) ";"Landed 1:01 PM"
EDITED with the actual date (not working, format issue):
df = pd.DataFrame(newlist)
now = time.strftime('%d-%m-%Y')
df['ACTUALDATE'] = now
#df.rows = header
df.to_csv('output.csv', sep=';', encoding='latin-1', index=False)
I put this inside the loop so I can record the actual date (I would like hours and minutes too, but this format only gives the day).
It seems so trivial that I'm not even sure I really understood the question... If what you want is to add name as the last element of each row in your csv, all you have to do is, well, add it as the last element of the rows you're passing to your csv writer:
for record in table.find_all('tr', class_="hidden-xs hidden-sm ng-scope"):
    temp_data = []
    for data in record.find_all("td"):
        temp_data.append(data.text.encode('latin-1'))
    # here
    temp_data.append(name)
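Two caveats: name as found above is a BeautifulSoup Tag, so appending it directly writes the whole tag into the CSV (you probably want its text), and the ACTUALDATE column from your header can be filled with a strftime format that includes hours and minutes. A minimal sketch reusing the variables from the question:
actualdate = time.strftime('%d-%m-%Y %H:%M')  # day plus hours and minutes
for record in table.find_all('tr', class_="hidden-xs hidden-sm ng-scope"):
    temp_data = []
    for data in record.find_all("td"):
        temp_data.append(data.text.encode('latin-1'))
    temp_data.append(name.get_text(strip=True).encode('latin-1'))  # WHEREFROM as plain text, not the Tag object
    temp_data.append(actualdate)                                    # ACTUALDATE
    datatable.append(temp_data)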
Related
I need to scrape all the table data from the Rajya Sabha website. However, instead of scraping the page at each URL, page by page, the code keeps scraping the original table.
from selenium import webdriver
import chromedriver_binary
import os
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import csv
import time
import lxml
url = 'https://rsdebate.nic.in/simple-search?query=climate+change&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start=0'
#url_call = f"https://rsdebate.nic.in/simple-search?query=climate+change&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start={i}"
page = requests.get(url)
soup = BeautifulSoup(page.text, 'lxml')
table1 = soup.find('table', id='sam_table')
headers = []
for a in table1.find_all('th'):
    title = a.text
    headers.append(title)
rsdata = pd.DataFrame(columns = headers)
rsdata.to_csv('rs_debate_data.csv', mode ='a',index=False)
# Create a for loop to fill rajya sabha data
for k in range(0,96):
    url_call = f"https://rsdebate.nic.in/simple-search?query=climate+change&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start={k}"
    page = requests.get(url_call)
    for j in table1.find_all('tr')[1:]:
        row_data = j.find_all('td')
        row = [i.text for i in row_data]
        length = len(rsdata)
        rsdata.loc[length] = row
    rsdata.to_csv('rs_debate_data.csv', mode ='a', index=False, header=False)
    print(k)
# Export to csv
# Try to read csv
#rs_data = pd.read_csv('rs_debate_data.csv')
I was trying to scrape only the rows related to the keyword "climate change" in the debate title column of the table.
for k in range(0,96):
    url_call = "..."
    page = requests.get(url_call)
    for j in table1.find_all('tr')[1:]:
This loop does a find_all() on the original table1 result, not on the page it just fetched; you need to build a new BeautifulSoup object from each response and locate the table again inside the loop...
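A minimal sketch of that fix, reusing the names from the question:
for k in range(0, 96):
    url_call = f"https://rsdebate.nic.in/simple-search?query=climate+change&sort_by=dc.identifier.sessionnumber_sort&order=asc&rpp=100&etal=0&start={k}"
    page = requests.get(url_call)
    # parse the page that was just fetched and locate its table
    soup_k = BeautifulSoup(page.text, 'lxml')
    table_k = soup_k.find('table', id='sam_table')
    for j in table_k.find_all('tr')[1:]:
        row = [i.text for i in j.find_all('td')]
        rsdata.loc[len(rsdata)] = row
    # note: start may need to advance in steps of rpp (100) rather than 1 to page through the results
    rsdata.to_csv('rs_debate_data.csv', mode='a', index=False, header=False)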
I want to get the table from a web page.
import os
from webdriver_manager.chrome import ChromeDriverManager
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
options = Options()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--start-maximized')
options.page_load_strategy = 'eager'
options.add_argument("--headless");
driver = webdriver.Chrome(options=options)
wait = WebDriverWait(driver, 20)
driver.get("https://munafasutra.com/nse/dividends")
file_object = open('divident.csv', 'a')
output table
How do I get the first table and its values?
You have to look at the HTML and locate the WebElement that contains that first table (right-clicking and choosing "Inspect" in the browser will do the job).
You can save that webelement using the following line of code:
first_table = driver.find_element_by_xpath("//div[@id='co']//table[1]")  # The [1] is not really necessary, since find_element_by_xpath only returns the first matching element.
Then, if you look at how the data is organized inside that table, you can see that each row is held by a tr element. Therefore, if you wish to write it to a csv file, I would suggest writing it row by row with the following code:
rows = first_table.find_elements_by_xpath("./tbody/tr")
for row in rows:
    entries_of_the_row = row.find_elements_by_xpath("./td")
    row_to_csv = []
    for entry in entries_of_the_row:
        row_to_csv.append(entry.text)
    file_object.write(f"{row_to_csv[0]}, {row_to_csv[1]}, {row_to_csv[2]}, {row_to_csv[3]}, {row_to_csv[4]}\n")
file_object.close()
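If you'd rather not hard-code exactly five columns (rows can differ in length), a join-based variant of the same loop is possible, assuming the same rows and file_object as above:
for row in rows:
    cells = [entry.text for entry in row.find_elements_by_xpath("./td")]
    file_object.write(", ".join(cells) + "\n")  # write however many cells the row actually has
file_object.close()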
You can use the XPath below to retrieve the values of the first table:
//h3[text()=' Earlier dividends announced by companies ']/preceding-sibling::table/descendant::td
Something like this:
driver.get("https://munafasutra.com/nse/dividends")
first_table = driver.find_elements(By.XPATH, "//h3[text()=' Earlier dividends announced by companies ']/preceding-sibling::table/descendant::td")
for first in first_table:
    print(first.text)
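If you need the values grouped by row rather than as one flat list of cells, a sketch along the same lines (same anchor heading assumed):
rows = driver.find_elements(By.XPATH, "//h3[text()=' Earlier dividends announced by companies ']/preceding-sibling::table//tr")
for row in rows:
    cells = [cell.text for cell in row.find_elements(By.XPATH, "./td")]
    if cells:  # skip header rows that only contain th cells
        print(", ".join(cells))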
You can use BeautifulSoup to get the table data. Selenium is not required if you just want to extract web page data.
You need to import the packages below:
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
You can extract the HTML of the page using the code below (the soup variable will contain the parsed HTML of the entire page):
url_munafasutra = "https://munafasutra.com/nse/dividends"
html_munafasutra = urlopen(url_munafasutra)
soup = BeautifulSoup(html_munafasutra, 'html')
Below is the code to extract the HTML of the 1st table (table is the tag name, and the value in [] is the index of the table whose data we want to extract):
first_table = soup.find_all('table')[0]
You can also pass attributes along with the tag name to identify the table unambiguously.
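For example (the attribute value here is hypothetical; use whatever id or class the real table carries):
first_table = soup.find('table', attrs={'id': 'dividend_table'})  # hypothetical id, adjust to the actual markup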
Below is the code to extract all the rows of the selected table:
all_rows = first_table.findAll("tr")
Use the code below to write the data to a csv file:
with open("C:\\Users\\abhay\\.spyder-py3\\table_extract.csv", "wt+", newline="") as f:
table_to_csv = csv.writer(f)
for row in all_rows:
row_data = []
for cell in row.findAll(["td", "th"]):
row_data.append(cell.get_text())
table_to_csv.writerow(row_data)
Below is the complete code to extract the 1st table's data to csv:
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
url_munafasutra = "https://munafasutra.com/nse/dividends"
html_munafasutra = urlopen(url_munafasutra)
soup = BeautifulSoup(html_munafasutra, 'html')
first_table = soup.find_all('table')[0]
all_rows = first_table.findAll("tr")
with open("C:\\Users\\abhay\\.spyder-py3\\table_extract.csv", "wt+", newline="") as f:
table_to_csv = csv.writer(f)
for row in all_rows:
row_data = []
for cell in row.findAll(["td", "th"]):
row_data.append(cell.get_text())
table_to_csv.writerow(row_data)
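One small note: passing 'html' lets BeautifulSoup pick whichever HTML parser happens to be installed; naming a parser explicitly makes the choice deterministic. A one-line variant using the standard-library parser:
soup = BeautifulSoup(html_munafasutra, 'html.parser')  # or 'lxml' if it is installed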
I'm writing a Python script to extract the records of all people on a site using selenium, beautifulsoup and pandas. However, I don't know how to go about it, because the site requires a search before it shows any results. For testing purposes I'm passing a search value and driving the search via selenium. The issue is that when I run the script line by line in an IPython shell I get the desired results, but the same code throws an error when run as a file via the python command.
code
from selenium import webdriver
from bs4 import BeautifulSoup
from time import sleep
import pandas as pd
import requests
import re
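# br (the webdriver instance) and url (the search page) are assumed to be defined earlier; that part was omitted from the question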
br.get(url)
content = br.page_source
soup = BeautifulSoup(content, 'lxml')
sleep(2)
sName = br.find_element_by_xpath("/html/body/div[1]/div[2]/section/div[2]/div/div/div/div/div/div/div[2]/form/div[1]/div/div/input")
sleep(3)
sName.send_keys("martin")
br.find_element_by_xpath("//*[#id='provider']/div[1]/div/div/div/button").click()
sleep(3)
table = soup.find('table')
tbody = table.find_all('tbody')
body = tbody.find_all('tr')
#
# get column heads
head = body[0]
body_rows = body[1:]
headings = []
for item in head.find_all('th'):
    item = (item.text).rstrip("\n")
    headings.append(item)
print(headings)
#declare an empty list for holding all records
all_rows = []
# loop through all table rows to get all table datas
for row_num in range(len(body_rows)):
    row = []
    for row_item in body_rows[row_num].find_all('td'):
        stripA = re.sub("(\xa0)|(\n)|,", "", row_item.text)
        row.append(stripA)
    all_rows.append(row)
# match each record to its field name
# cols = ['name', 'license', 'xxx', 'xxxx']
df = pd.DataFrame(data=all_rows, columns=headings)
You don't need the overhead of a browser or to worry about waits. You can simply mimic the POST request the page makes:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
data = {'search_register': '1', 'search_text': 'Martin'}
r = requests.post('https://osp.nckenya.com/ajax/public', data=data)
soup = bs(r.content, 'lxml')
results = pd.read_html(str(soup.select_one('#datatable2')))
print(results)
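Note that pd.read_html returns a list of DataFrames, so to work with the table itself (and save it if needed), something like this sketch:
df = results[0]                           # the first parsed table
df.to_csv('providers.csv', index=False)   # hypothetical output filename
print(df.head())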
I'm making some progress with web scraping; however, I still need some help to perform some operations:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
# soup = BeautifulSoup(requests.get(converturl).content, 'html.parser')
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
out = []
for tr in soup.select('.col-md-4 tbody tr'):
Inside the class col-md-4 I know there are 3 tables. I want to generate a csv whose output has three values per row: first name, last name, and, as the last value, the header name of the table.
first name, last name, header table
Any help would be appreciated.
This is what I have done on my own:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
filename = url.rsplit('/', 1)[1] + '.csv'
tables = soup.select('.col-md-4 table')
rows = []
for tr in tables:
    t = tr.get_text(strip=True, separator='|').split('|')
    rows.append(t)
df = pd.DataFrame(rows)
print(df)
df.to_csv(filename)
Thanks,
This might work:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
tables = soup.select('.col-md-4 table')
rows = []
for table in tables:
    cleaned = list(table.stripped_strings)
    header, names = cleaned[0], cleaned[1:]
    data = [name.split(', ') + [header] for name in names]
    rows.extend(data)
result = pd.DataFrame.from_records(rows, columns=['surname', 'name', 'table'])
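To write it out you could reuse the filename logic from your own attempt, e.g. (a sketch):
result.to_csv(url.rsplit('/', 1)[1] + '.csv', index=False)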
You need to first iterate through each table you want to scrape, then for each table, get its header and rows of data. For each row of data, you want to parse out the First Name and Last Name (along with the header of the table).
Here's a verbose working example:
import requests
import pandas as pd
from bs4 import BeautifulSoup
url = 'http://fcf.cat/equip/1920/1i/sant-ildefons-ue-b'
soup = BeautifulSoup(requests.get(url).content, 'html.parser')
out = []
# Iterate through each of the three tables
for table in soup.select(".col-md-4 table"):
    # Grab the header and rows from the table
    header = table.select("thead th")[0].text.strip()
    rows = [s.text.strip() for s in table.select("tbody tr")]
    t = []  # This list will contain the rows of data for this table
    # Iterate through rows in this table
    for row in rows:
        # Split by comma (last_name, first_name)
        split = row.split(",")
        last_name = split[0].strip()
        first_name = split[1].strip()
        # Create the row of data
        t.append([first_name, last_name, header])
    # Convert list of rows to a DataFrame
    df = pd.DataFrame(t, columns=["first_name", "last_name", "table_name"])
    # Append to list of DataFrames
    out.append(df)
# Write to CSVs...
out[0].to_csv("first_table.csv", index=None)  # etc...
Whenever you're web scraping, I highly recommend using strip() on all of the text you parse to make sure you don't have superfluous spaces in your data.
I hope this helps!
I have the link below:
http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Pune
From it I want to scrape the data into Excel in a proper format. Each SurveyNo link reveals extra data when it is clicked; I want the row-wise table data together with the data shown after clicking the survey number.
I also want the format that I have attached in the image (desired output in Excel).
import urllib.request
from bs4 import BeautifulSoup
import csv
import os
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
import time
url = 'http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Pune'
chrome_path =r'C:/Users/User/AppData/Local/Programs/Python/Python36/Scripts/chromedriver.exe'
driver = webdriver.Chrome(executable_path=chrome_path)
driver.implicitly_wait(10)
driver.get(url)
Select(driver.find_element_by_name('ctl00$ContentPlaceHolder5$ddlTaluka')).select_by_value('5')
Select(driver.find_element_by_name('ctl00$ContentPlaceHolder5$ddlVillage')).select_by_value('1872')
soup=BeautifulSoup(driver.page_source, 'lxml')
table = soup.find("table" , attrs = {'id':'ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate' })
with open('Baner.csv', 'w', encoding='utf-16', newline='') as csvfile:
    f = csv.writer(csvfile, dialect='excel')
    f.writerow(['SurveyNo', 'Subdivision', 'Open ground', 'Resident house', 'Offices', 'Shops', 'Industrial', 'Unit (Rs./)'])  # headers
    rows = table.find_all('tr')[1:]
    data = []
    for tr in rows:
        cols = tr.find_all('td')
        for td in cols:
            links = driver.find_elements_by_link_text('SurveyNo')
            l = len(links)
            data12 = []
            for i in range(l):
                newlinks = driver.find_elements_by_link_text('SurveyNo')
                newlinks[i].click()
                soup = BeautifulSoup(driver.page_source, 'lxml')
                td1 = soup.find("textarea", attrs={'class': 'textbox'})
                data12.append(td1.text)
            data.append(td.text)
            data.append(data12)
    print(data)
Please see the image; that is the format in which I need the scraped output.
You could do the following and simply re-arrange the columns at the end, along with any desired renaming. The assumption is that a SurveyNo link exists for all wanted rows. I extract the hrefs from the SurveyNo cells, which are actually executable javascript strings you can pass to execute_script to reveal the survey numbers without worrying about stale elements etc.
from selenium import webdriver
import pandas as pd
url = 'http://www.igrmaharashtra.gov.in/eASR/eASRCommon.aspx?hDistName=Pune'
d = webdriver.Chrome()
d.get(url)
d.find_element_by_css_selector('[value="5"]').click()
d.find_element_by_css_selector('[value="1872"]').click()
tableElement = d.find_element_by_id('ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate')
table = pd.read_html(tableElement.get_attribute('outerHTML'))[0]
table.columns = table.iloc[0]
table = table.iloc[1:]
table = table[table.Select == 'SurveyNo'] #assumption SurveyNo exists for all wanted rows
surveyNo_scripts = [item.get_attribute('href') for item in d.find_elements_by_css_selector("#ctl00_ContentPlaceHolder5_grdUrbanSubZoneWiseRate [href*='Select$']")]
i = 0
for script in surveyNo_scripts:
    d.execute_script(script)
    surveys = d.find_element_by_css_selector('textarea').text
    table.iloc[i]['Select'] = surveys
    i += 1
print(table)
#rename and re-order columns as required
table.to_csv(r"C:\Users\User\Desktop\Data.csv", sep=',', encoding='utf-8-sig',index = False )
Output before rename and re-order:
If you loop over several pages you can concat all the dfs and then write them out in one go (my preference, shown below), or append to the csv as you go.
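A minimal sketch of the concat approach (the page list and per-page scraping helper are hypothetical placeholders for the code above):
import pandas as pd

dfs = []
for page_url in page_urls:             # page_urls: hypothetical list of pages to scrape
    table = scrape_one_page(page_url)  # hypothetical helper returning one DataFrame, built as shown above
    dfs.append(table)

combined = pd.concat(dfs, ignore_index=True)
combined.to_csv(r"C:\Users\User\Desktop\Data.csv", sep=',', encoding='utf-8-sig', index=False)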