How to select a specific value from an HTML table using Python pandas: for example, the row with Tag 2.4.33 needs to be picked from the table returned by parsing the HTML page.
root#1ec99b8b97af:/opt# python lookuptag.py
Id Tag Created Layers Size Delete
0 bb84b573f76 2.4.33 2 years ago 22 179.6 MB Delete
1 bb84b573f76 2.4.33-t2 2 years ago 22 179.6 MB Delete
2 5c97c0e3531 v8-2.4.33 1 year ago 22 180.7 MB Delete
Here is my pandas code; I can parse the HTML and print the table with:
import requests
import pandas as pd
url = 'http://docker-registry:8080/repo/tags/httpd'
html = requests.get(url).content
df_list = pd.read_html(html, header=0, flavor='bs4')
df = df_list[-1]
print(df)
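If the goal is just to pick the row whose Tag is 2.4.33 from the parsed table, a boolean filter on the DataFrame is enough. A minimal sketch, assuming the column names shown in the printed output above (Id, Tag, ...):

# Row(s) whose Tag column is exactly '2.4.33'
row = df.loc[df['Tag'] == '2.4.33']
print(row)

# Or just the image Id for that tag (first match)
image_id = df.loc[df['Tag'] == '2.4.33', 'Id'].iloc[0]
print(image_id)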
def FetchTable(context, tablexpath):
    url = 'https://www.espncricinfo.com/table/series/8048/season/2020/indian-premier-league'
    tables = pd.read_html(url)
    table = tables[0].applymap(str)
    return table
def LookupValueInColumnTwoKeys(context, source_table, reference_column_1, reference_value_1, reference_column_2, reference_value_2, lookup_column):
    lookup_column = lookup_column.replace(' ', '')
    reference_value_2 = reference_value_2.replace(' ', '')
    reference_value_1 = reference_value_1.replace(' ', '')
    referenceindex = 0
    referenceindex1 = 0
    referenceindexfound = False
    referenceindexfound1 = False
    lookupcolumnindex = 0
    rowindex = 0
    rowindexfound = False
    lookupcolumnindexfound = False
    for headers in source_table.columns:
        if referenceindexfound == False:
            referenceindex = referenceindex + 1
        if referenceindexfound1 == False:
            referenceindex1 = referenceindex1 + 1
        if lookupcolumnindexfound == False:
            lookupcolumnindex = lookupcolumnindex + 1
        if headers == reference_column_1:
            referenceindexfound = True
        if headers == reference_column_2:
            referenceindexfound1 = True
        if headers == lookup_column:
            lookupcolumnindexfound = True
        if referenceindexfound and lookupcolumnindexfound and referenceindexfound1:
            break
    for tablerow in source_table.values:
        print(tablerow)
        if rowindexfound == False:
            print(tablerow[referenceindex - 1])
            print(tablerow[referenceindex1 - 1])
            if tablerow[referenceindex - 1].find(reference_value_1) != -1 and tablerow[referenceindex1 - 1].find(reference_value_2) != -1:
                rowindexfound = True
                #rowindex = rowindex + 1
            else:
                rowindex = rowindex + 1
        else:
            break
    print("source table" + source_table.values[rowindex][lookupcolumnindex - 1])
    return source_table.values[rowindex][lookupcolumnindex - 1]
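For comparison, the same two-key lookup can be written as a boolean mask instead of tracking column indexes by hand. This is only a sketch, assuming the table cells are already strings (as returned by FetchTable above):

def lookup_value_two_keys(table, ref_col_1, ref_val_1, ref_col_2, ref_val_2, lookup_col):
    # Keep rows where both reference columns contain their reference values
    mask = (table[ref_col_1].str.contains(ref_val_1, regex=False)
            & table[ref_col_2].str.contains(ref_val_2, regex=False))
    return table.loc[mask, lookup_col].iloc[0]

# e.g. lookup_value_two_keys(df, 'Id', 'bb84b573f76', 'Tag', '2.4.33', 'Size')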
Another file:
from behave import *
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from readTableDataFromDB import readTableDataFromDB
from pandacode import WebTableValidationHelper as pandacode
from selenium.webdriver.chrome.options import Options
context.driver.get("https://www.espncricinfo.com/table/series/8048/season/2020/indian-premier-league")
matrix = pandacode.FetchTable(context, "//*[@class='table table-sm standings-widget-table text-center mb-0 border-bottom']")
ismatrixequal = pandacode.VerifyTable(context, matrix, matrix)
#print(ismatrixequal)
lookupvalue = pandacode.LookupValueFromColumnSingleKey(context,matrix,"TEAM", "Delhi Capitals", "PT")
print(lookupvalue)
Another piece of code:
def LookupValueFromColumnSingleKey1(context, source_table, reference_column_1, rowName, columnName):
    referenceindex = 0
    referenceindexfound = False
    columnindex = 0
    rowindex = 0
    rowindexfound = False
    columnindexfound = False
    for headers in source_table.columns:
        if referenceindexfound == False:
            referenceindex = referenceindex + 1
        if columnindexfound == False:
            columnindex = columnindex + 1
        if headers == reference_column_1:
            referenceindexfound = True
        if headers == columnName:
            columnindexfound = True
        if referenceindexfound and columnindexfound:
            break
    for tablerow in source_table.values:
        #print(tablerow)
        if rowindexfound == False:
            rowindex = rowindex + 1
            for tupledata in tablerow:
                #if tupledata.find(rowName) != -1:
                if tupledata.lower() == rowName.lower():
                    print(tupledata)
                    rowindexfound = True
    #print("source table" + source_table.values[rowindex-1][columnindex-1])
    #print(source_table[columnindex][rowindex])
    return source_table.values[rowindex - 1][columnindex - 1]
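Likewise, the single-key lookup can be reduced to a .loc call. A sketch only; the TEAM / "Delhi Capitals" / PT names come from the example call shown earlier:

def lookup_value_single_key(table, reference_column, row_value, lookup_column):
    # Case-insensitive match on the reference column, then return the lookup column
    mask = table[reference_column].str.lower() == row_value.lower()
    return table.loc[mask, lookup_column].iloc[0]

# e.g. lookup_value_single_key(matrix, "TEAM", "Delhi Capitals", "PT")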
I am running Selenium code on the DNCA website to scrape some of the document links. I am trying to get the links for each value in the drop-down for each section shown on this page. My code works fine, but when I run the same code with the option headless = True, I get the following error:
ElementClickInterceptedException: element click intercepted: Element <li data-original-index="0">...</li> is not clickable at point (226, 250). Other element would receive the click: <div class="col-md-12">...</div>
(Session info: headless chrome=104.0.5112.81)
Code:
import sys

import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By

def get_active_row(active_tab, fund_id):
    active_row = active_tab.find_elements(By.XPATH, ".//tr[@style='' or @style='display: table-row;'][@fund-id = '{}']".format(fund_id))
    try:
        assert len(active_row) == 1
        active_row = active_row[0]
        return active_row
    except AssertionError as asserr:
        print(asserr, ' -- More than one active row for the fund id: ', fund_id)
        sys.exit(1)
    except Exception as err:
        print(err, ' -- fund id:', fund_id)
        sys.exit(1)
def scrap(driver):
    tab_list = driver.find_element(By.XPATH, "//ul[contains(@role, 'tablist')]")
    tab_list_names = tab_list.find_elements(By.XPATH, './/li')
    data_list = []
    for loc, tab_name in enumerate(tab_list_names):
        if loc < 20:
            tab_name.click()
            html = driver.page_source
            soup = BeautifulSoup(html)
            bs_active_tab = soup.find('div', {'class': 'tab-pane table-datas active'})
            bs_headers = bs_active_tab.find('thead')
            headers = [i.text for i in bs_headers.find_all('td')]
            active_tab = driver.find_element(By.XPATH, "//div[contains(@class, 'tab-pane table-datas active')]")
            unique_fund_ids = [i_fund.get_attribute('fund-id') for i_fund in active_tab.find_elements(By.XPATH, ".//tr[@style]") if i_fund.get_attribute('fund-id') != '-']
            lookup = set()
            unique_fund_ids = [x for x in unique_fund_ids if x not in lookup and lookup.add(x) is None]
            for fund_id in unique_fund_ids:  # Iterate over each fund
                active_row = get_active_row(active_tab, fund_id)
                active_row.find_element(By.XPATH, './/button').click()
                isin_list = [i.text for i in active_row.find_elements(By.XPATH, './/li')]
                for pos, isin_val in enumerate(isin_list):
                    isin_selected = active_row.find_elements(By.XPATH, './/li')[pos]
                    isin_selected.click()
                    active_row = get_active_row(active_tab, fund_id)
                    fund_name = ''
                    for pos_inner, td in enumerate(active_row.find_elements(By.XPATH, ".//td")):
                        a_tag = td.find_elements(By.XPATH, ".//a")
                        if len(a_tag) == 1:
                            a_tag = a_tag[0]
                            if pos_inner == 0:
                                fund_name = a_tag.text
                            link = a_tag.get_attribute('href')
                            data_list.append([tab_name.text, fund_name, isin_val, headers[pos_inner], link])
                        else:
                            data_list.append([tab_name.text, fund_name, isin_val, headers[pos_inner], ''])
                active_row = get_active_row(active_tab, fund_id)
                active_row.find_element(By.XPATH, './/button').click()
                isin_selected_to_close = active_row.find_elements(By.XPATH, './/li')[0]
                isin_selected_to_close.click()
            tlg_tr_tab = active_tab.find_element(By.XPATH, ".//tr[@fund-id='-']")
            for tlg_pos_inner, tlg_td in enumerate(tlg_tr_tab.find_elements(By.XPATH, ".//td")):
                tlg_a_tag = tlg_td.find_elements(By.XPATH, ".//a")
                if len(tlg_a_tag) == 1:
                    tlg_a_tag = tlg_a_tag[0]
                    tlg_link = tlg_a_tag.get_attribute('href')  # Get document link
                    data_list.append([tab_name.text, 'Toute la gamme', '', headers[tlg_pos_inner], tlg_link])
                else:
                    data_list.append([tab_name.text, 'Toute la gamme', '', headers[tlg_pos_inner], ''])
    dataset_links = pd.DataFrame(data_list, columns=['Tab', 'Fund Name', 'ISIN', 'Type', 'Link'])
    driver.quit()
Can someone please explain why it works fine with headless = False but not with headless = True?
In headless mode the default window size is very small, significantly smaller than the window size in regular mode.
So, to overcome this problem, you need to set the window size explicitly.
It can be done in the following ways:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

options = Options()
options.add_argument("--headless")
options.add_argument("--window-size=1920,1080")
webdriver_service = Service(r'C:\webdrivers\chromedriver.exe')
driver = webdriver.Chrome(service=webdriver_service, options=options)
Or just
driver.set_window_size(1920, 1080)
Both approaches should work.
I prefer the first way :)
I have this problem with my code. When I insert three or more params in the body of the request, I get this error: "POST Error: or_ expected 2 arguments, got 3."
I can only pass one or two parameters in the body; in that case it works fine. But I don't understand where the mistake is. Can someone help me?
def read_uptime(logid, filteredData, dateStart, dateEnd, timeStart, timeEnd, elementsForPage, currentPage, filterUptime):
    log.info(f"{logid} read_uptime: Started")
    try:
        # Check Timeframe Correct
        startDateTime, endDateTime = _checkDataInput(timeStart, timeEnd, dateStart, dateEnd)
        # Create Filter
        filters = _createFilter(filteredData, startDateTime, endDateTime, filterUptime)
        # Query
        dataFiltered = uptime_model_db.query.with_entities(
            uptime_model_db.projectId.label('projectId'),
            uptime_model_db.url.label('url'),
            uptime_model_db.timeStamp.label('timeStamp'),
            uptime_model_db.uptime.label('uptime'),
            uptime_model_db.latency.label('latency')
        ).filter(*filters).paginate(per_page=int(elementsForPage + 1), page=int(currentPage), error_out=True)
        # Checking more pages
        nextPage = {
            "currentPage": currentPage,
            "totalElements": len(dataFiltered.items)
        }
        if (len(dataFiltered.items) > elementsForPage):
            nextPage["nextPage"] = True
        else:
            nextPage["nextPage"] = False
        # Format and return JSON
        return _createJson(dataFiltered.items, nextPage)
    except Exception as e:
        log.error(f"{logid} read_uptime: function read_uptime returned {e}")
        raise e
In this code, I get the error at "arrayFilter.append(and_(uptime_model_db.projectId == projectId, or_(*arrayUrl)))":
def filterAppend(arrayFilter, urls, projectId, arrayUrl):
    if (len(urls) == 1):
        arrayFilter.append(and_(uptime_model_db.projectId == projectId, uptime_model_db.url == urls[0]))
    if (len(urls) > 1):
        for url in urls:
            arrayUrl.append(uptime_model_db.url == url)
        arrayFilter.append(and_(uptime_model_db.projectId == projectId, or_(*arrayUrl)))
And in this code, I get the error at "filters.append(or_(*arrayFilter))":
def _createFilter(filteredData, startDateTime, endDateTime, filterUptime):
    filters = []
    if filteredData is not None:
        arrayFilter = []
        for data in filteredData:
            projectId = data["projectId"]
            urls = data["uptimeUrls"]
            arrayUrl = []
            if (len(filteredData) == 1):
                filterAppend(filters, urls, projectId, arrayUrl)
            else:
                filterAppend(arrayFilter, urls, projectId, arrayUrl)
        if (len(filteredData) > 1 or len(arrayFilter) > 1):
            filters.append(or_(*arrayFilter))
    if startDateTime is not None:
        filters.append(str(startDateTime) <= uptime_model_db.timeStamp)
    if startDateTime is not None:
        filters.append(str(endDateTime) >= uptime_model_db.timeStamp)
    if filterUptime == "True":
        filters.append(uptime_model_db.uptime < 100)
    return filters
Import or_ from the top-level sqlalchemy package instead of from the operators module; the operators version is a binary operator that takes exactly two arguments:
from sqlalchemy import or_
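A minimal, standalone sketch of the difference, using sqlalchemy.column so it runs without the question's model (the names here are placeholders):

from sqlalchemy import and_, column, or_  # this or_ accepts any number of conditions

url = column("url")
project_id = column("projectId")

# Any number of URL conditions can be OR-ed together:
conditions = [url == u for u in ("u1", "u2", "u3")]
expr = and_(project_id == 42, or_(*conditions))
print(expr)  # roughly: projectId = :projectId_1 AND (url = :url_1 OR url = :url_2 OR url = :url_3)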
I am trying to make a table (or CSV; I am using a pandas DataFrame) from the information in an XML file.
The file is here (the .zip is 14 MB, the XML is ~370 MB): https://nvd.nist.gov/feeds/xml/cpe/dictionary/official-cpe-dictionary_v2.3.xml.zip . It has package information for different languages - node.js, python, java etc., aka the CPE 2.3 list by the US government org NVD.
This is what the first 30 lines look like:
<cpe-list xmlns:config="http://scap.nist.gov/schema/configuration/0.1" xmlns="http://cpe.mitre.org/dictionary/2.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:scap-core="http://scap.nist.gov/schema/scap-core/0.3" xmlns:cpe-23="http://scap.nist.gov/schema/cpe-extension/2.3" xmlns:ns6="http://scap.nist.gov/schema/scap-core/0.1" xmlns:meta="http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2" xsi:schemaLocation="http://scap.nist.gov/schema/cpe-extension/2.3 https://scap.nist.gov/schema/cpe/2.3/cpe-dictionary-extension_2.3.xsd http://cpe.mitre.org/dictionary/2.0 https://scap.nist.gov/schema/cpe/2.3/cpe-dictionary_2.3.xsd http://scap.nist.gov/schema/cpe-dictionary-metadata/0.2 https://scap.nist.gov/schema/cpe/2.1/cpe-dictionary-metadata_0.2.xsd http://scap.nist.gov/schema/scap-core/0.3 https://scap.nist.gov/schema/nvd/scap-core_0.3.xsd http://scap.nist.gov/schema/configuration/0.1 https://scap.nist.gov/schema/nvd/configuration_0.1.xsd http://scap.nist.gov/schema/scap-core/0.1 https://scap.nist.gov/schema/nvd/scap-core_0.1.xsd">
<generator>
<product_name>National Vulnerability Database (NVD)</product_name>
<product_version>4.9</product_version>
<schema_version>2.3</schema_version>
<timestamp>2022-03-17T03:51:01.909Z</timestamp>
</generator>
<cpe-item name="cpe:/a:%240.99_kindle_books_project:%240.99_kindle_books:6::~~~android~~">
<title xml:lang="en-US">$0.99 Kindle Books project $0.99 Kindle Books (aka com.kindle.books.for99) for android 6.0</title>
<references>
<reference href="https://play.google.com/store/apps/details?id=com.kindle.books.for99">Product information</reference>
<reference href="https://docs.google.com/spreadsheets/d/1t5GXwjw82SyunALVJb2w0zi3FoLRIkfGPc7AMjRF0r4/edit?pli=1#gid=1053404143">Government Advisory</reference>
</references>
<cpe-23:cpe23-item name="cpe:2.3:a:\$0.99_kindle_books_project:\$0.99_kindle_books:6:*:*:*:*:android:*:*"/>
</cpe-item>
The tree structure of the XML file is quite simple, the root is 'cpe-list', the child element is 'cpe-item', and the grandchild elements are 'title', 'references' and 'cpe23-item'.
From 'title', I want the text in the element;
From 'cpe23-item', I want the attribute 'name';
From 'references', I want the 'href' attributes of its 'reference' children (great-grandchildren of the root).
The dataframe should look like this:
| cpe23_name | title_text | ref1 | ref2 | ref3 | ref_other
0 | 'cpe23name 1'| 'this is a python pkg'| 'url1'| 'url2'| NaN | NaN
1 | 'cpe23name 2'| 'this is a java pkg' | 'url1'| 'url2'| NaN | NaN
...
My code is here; it finished in ~100 seconds:
import time
import xml.etree.ElementTree as et
import pandas as pd

xtree = et.parse("official-cpe-dictionary_v2.3.xml")
xroot = xtree.getroot()

start_time = time.time()
df_cols = ["cpe", "text", "vendor", "product", "version", "changelog", "advisory", "others"]
title = '{http://cpe.mitre.org/dictionary/2.0}title'
ref = '{http://cpe.mitre.org/dictionary/2.0}references'
cpe_item = '{http://scap.nist.gov/schema/cpe-extension/2.3}cpe23-item'
p_cpe = None
p_text = None
p_vend = None
p_prod = None
p_vers = None
p_chan = None
p_advi = None
p_othe = None
rows = []
i = 0
while i < len(xroot):
    for elm in xroot[i]:
        if elm.tag == title:
            p_text = elm.text
            # assign p_text
        elif elm.tag == ref:
            for nn in elm:
                s = nn.text.lower()
                # check the lower-cased text in refs
                if 'version' in s:
                    p_vers = nn.attrib.get('href')
                    # assign p_vers
                elif 'advisor' in s:
                    p_advi = nn.attrib.get('href')
                    # assign p_advi
                elif 'product' in s:
                    p_prod = nn.attrib.get('href')
                    # assign p_prod
                elif 'vendor' in s:
                    p_vend = nn.attrib.get('href')
                    # assign p_vend
                elif 'change' in s:
                    p_chan = nn.attrib.get('href')
                    # assign p_chan
                else:
                    p_othe = nn.attrib.get('href')
        elif elm.tag == cpe_item:
            p_cpe = elm.attrib.get("name")
            # assign p_cpe
        else:
            print(elm.tag)
    row = [p_cpe, p_text, p_vend, p_prod, p_vers, p_chan, p_advi, p_othe]
    rows.append(row)
    p_cpe = None
    p_text = None
    p_vend = None
    p_prod = None
    p_vers = None
    p_chan = None
    p_advi = None
    p_othe = None
    print(len(rows))  # this shows how far I got during the running time
    i += 1

out_df1 = pd.DataFrame(rows, columns=df_cols)  # moved outside the loop by removing the indent (see update below)
print("---853k rows take %s seconds ---" % (time.time() - start_time))
Updated: the faster way is to move the second-to-last line outside the loop. Since rows already collects the info in each iteration, there is no need to build a new DataFrame every time.
The running time is now 136.0491042137146 seconds. Yay!
Since your XML is fairly flat, consider the recently added IO function, pandas.read_xml, introduced in v1.3. Given the XML uses a default namespace, to reference elements in xpath use the namespaces argument:
url = "https://nvd.nist.gov/feeds/xml/cpe/dictionary/official-cpe-dictionary_v2.3.xml.zip"
df = pd.read_xml(
    url, xpath=".//doc:cpe-item", namespaces={'doc': 'http://cpe.mitre.org/dictionary/2.0'}
)
If you do not have the default parser, lxml, installed, use the etree parser:
df = pd.read_xml(
    url, xpath=".//doc:cpe-item", namespaces={'doc': 'http://cpe.mitre.org/dictionary/2.0'}, parser="etree"
)
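If memory or speed is still a concern with the full ~370 MB file, a streaming pass with ElementTree.iterparse is another option. This is only a sketch (untested against the full feed); the element and attribute names are taken from the XML snippet above:

import xml.etree.ElementTree as et
import pandas as pd

NS = "{http://cpe.mitre.org/dictionary/2.0}"
NS23 = "{http://scap.nist.gov/schema/cpe-extension/2.3}"

rows = []
for event, elem in et.iterparse("official-cpe-dictionary_v2.3.xml", events=("end",)):
    if elem.tag == NS + "cpe-item":
        cpe23 = elem.find(NS23 + "cpe23-item")
        rows.append({
            "cpe23_name": cpe23.get("name") if cpe23 is not None else None,
            "title_text": elem.findtext(NS + "title"),
            "refs": [r.get("href") for r in elem.iter(NS + "reference")],
        })
        elem.clear()  # free the processed item so memory does not grow with the file

df = pd.DataFrame(rows)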
Why does Python print andata/ritorno 3 times? I've tried to outdent the if, but it doesn't work.
listlf = []
connf = sqlite3.connect("Database.db")
coursorf = connf.cursor()
sqltf = coursorf.execute("select np,id from orari")
for lf1 in sqltf:
    lf2 = [lf1[0], lf1[1]]
    listlf.append(lf2)
for lf3 in listlf:
    if town == str(lf3[0]) and bs == str(lf3[1]):
        loc1 = listlf.index(lf3)
        for lf3 in listlf:
            if des_town == str(lf3[0]) and des_bs == str(lf3[1]):
                loc2 = listlf.index(lf3)
            if loc1 < loc2:
                print("andata")
            else:
                print("ritorno")
Output:
andata
andata
andata
You have two nested loops using the same variable:
for lf3 in listlf:  # lf3 !
    if town == str(lf3[0]) and bs == str(lf3[1]):
        loc1 = listlf.index(lf3)
        for lf3 in listlf:  # lf3 too!
Whatever you tried to do, this seems wrong.
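A sketch of the intended logic with distinct loop variables and the comparison done once, after both indexes are found (assumes town, bs, des_town, des_bs and listlf are defined as in the question):

loc1 = loc2 = None
for start_row in listlf:
    if town == str(start_row[0]) and bs == str(start_row[1]):
        loc1 = listlf.index(start_row)
for dest_row in listlf:
    if des_town == str(dest_row[0]) and des_bs == str(dest_row[1]):
        loc2 = listlf.index(dest_row)
if loc1 is not None and loc2 is not None:
    print("andata" if loc1 < loc2 else "ritorno")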
I have an Excel file containing data like in the picture.
"doc_id" refers to the document ID where the text comes from. In our example, we have 4 documents (doc_id from 0 to 3).
I want to get the values of "text" in the first 5 pages of each document OR before Table of Contents.
With our example, it should return:
"A0","A1","B1","A3"
(Note that we don't want B0, C0, D0, C1 because they occur after Table of Contents of that document, and we don't want A2 and B3 because they have page_id >= 5)
I don't understand how to create a condition to "break" the iteration within each doc_id once we find Table of Contents or page_id == 5, and move on to the next doc_id.
I tried this and I'm stuck:
import pandas as pd

data = pd.read_csv('book2.csv')
test_data = data['text']
doc_id = data['doc_id']
page_id = data['page_id']

def TOC(text):
    return 'content' in text

def new_doc():
    if i == 0:
        return False
    elif doc_id[i] != doc_id[i-1]:
        return True

i = 0
while i < len(test_data):
    stop = 0
    while stop == 0 and not new_doc():
        if TOC(test_data[i]):
            print('toc')
            stop = 1
        else:
            print(doc_id[i], test_data[i])
            i += 1
Appreciate your help. Thanks!
See if this helps:
a = df[df.page_id < 5]

def tex(x):
    try:
        if (x.any()):
            # index of the first row in this document whose text contains 'Table'
            i = x.index[x.str.contains('Table')][0]
    except IndexError:
        # no 'Table of Contents' in this document: keep every row (index just past the end)
        i = x.index[-1] + 1
    return i

a[a.index < a.groupby('doc_id')['text'].transform(tex)]['text'].to_list()
Output
['A0', 'A1', 'B1', 'A3']
You have to iterate through the whole document:
import pandas as pd

data = pd.read_csv('book2.csv')[['page_id', 'doc_id', 'text']]
curr_doc_id = -1
before_toc = False
for i, row in data.iterrows():
    if curr_doc_id < row.doc_id:
        # new document: reset the flag
        curr_doc_id = row.doc_id
        before_toc = True
    if row.text == "Table of Contents":
        before_toc = False
    if before_toc and row.page_id < 5:
        print(row)
*code wasn't tested
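For reference, a hedged vectorized variant of the same idea, assuming the TOC rows literally contain the text "Table of Contents" and the column names from the question:

import pandas as pd

data = pd.read_csv('book2.csv')
is_toc = data['text'].str.contains('Table of Contents', case=False, na=False)
# True for every row at or after the first TOC row within its document
at_or_after_toc = is_toc.astype(int).groupby(data['doc_id']).cummax().astype(bool)
result = data.loc[~at_or_after_toc & (data['page_id'] < 5), 'text'].tolist()
print(result)  # should give ['A0', 'A1', 'B1', 'A3'] for the example data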