Extract table only from a single page using python - python

I am trying to extract a table in a word document, the following code extracts all the tables, how can I download the table only from one page, let's say page 48
from docx.api import Document
import pandas as pd

document = Document("C:/Users/xxx/Downloads/abc.docx")

# Collect all cell texts first: DataFrame.append was removed in pandas 2.0,
# and building the frame once from a list is also far faster than growing it
# row by row.
rows = []
for table in document.tables:
    for row in table.rows:
        # One list of cell texts per table row.
        rows.append([cell.text for cell in row.cells])

df = pd.DataFrame(
    rows,
    columns=["Column1", "Column2", "Column3", "Column4", "Column5"],
)
df.to_excel("C:/Users/xxx/Downloads/output.xlsx")

Related

Extract data from a docx and insert into an xlsx or csv file in python

I'm trying to extract data from a table that is in a docx file, and insert this data into an xlsx or csv file. I chose to do it in Python, due to the ease of handling data.
Attached is the formatting of the data in the table
This is the script I'm using to generate the tables in xlsx. But the data from table 1 and table 2 are joined.
Name, id, year, subject, start and observations.
So the ideal would be to create a spreadsheet with this data instead of creating 2 tables.
I got other settings in the code but to no avail.
That's the question, I'm not able to join these 2 tables
import pandas as pd
from docx import Document

document = Document(r"test.docx")

# Write each Word table to its own Excel file.
for index, table in enumerate(document.tables):
    # Pre-size a grid of empty strings matching the table's dimensions.
    grid = [['' for _ in range(len(table.columns))] for _ in range(len(table.rows))]
    for i, row in enumerate(table.rows):
        for j, cell in enumerate(row.cells):
            grid[i][j] = cell.text
    pd.DataFrame(grid).to_excel("Tabela__ " + str(index) + ".xlsx")
print("Done")
Expected configuration
So if anyone here has experience with this kind of data transformation, I would be very grateful.
Test file 71433068_data.docx can be downloaded from here
Possible solution is the following:
# pip install python-docx
# pip install pandas
import pandas as pd
from docx import Document

document = Document("71433068_data.docx")

# Group tables in pairs: each subject is described by two consecutive tables.
tbl_list_grp = [document.tables[i:i + 2] for i in range(0, len(document.tables), 2)]

all_data = []
for subject in tbl_list_grp:
    tbl_data = {}
    # Table 1 - NAME, ID (keys are in the first column).
    # Each value is repeated so its column lines up with table 2's data rows.
    for row in subject[0].rows:
        kv = tuple(cell.text for cell in row.cells)
        tbl_data[kv[0]] = [kv[1]] * (len(subject[1].rows) - 1)
    # Table 2 - YEAR, ..., OBSERVATION (keys are in the first row).
    for column in subject[1].columns:
        col_cells = [cell.text for cell in column.cells]
        tbl_data[col_cells[0]] = col_cells[1:]
    all_data.append(tbl_data)

# Combine all subjects into a single DataFrame.
df = pd.concat([pd.DataFrame(i) for i in all_data]).reset_index(drop=True)
df
Returns
# Write dataframe data to excel.
# Fix: the original passed encoding='urf-8' — a typo for 'utf-8' — and the
# `encoding` keyword was removed from DataFrame.to_excel in pandas 2.0, so
# the argument is dropped entirely.
df.to_excel('71433068_data.xlsx', index=False)

How to concat docx file in python?

Below is my code:
v_excel = []
# Walk the directory tree and load every .xlsx file found.
for root, dirs, files in os.walk(paths):
    for t in files:
        if t.endswith('.xlsx'):
            # NOTE(review): files found in subdirectories should probably be
            # joined with `root`, not `paths` — confirm against the caller.
            df = pd.read_excel(os.path.join(paths, t), header=None, index_col=False)
            v_excel.append(df)
# Put each workbook's data side by side, one column block per file.
conc = pd.concat(v_excel, axis=1, ignore_index=True)
conc output:
#after appending two excel files i can successively concat the files and put it in
#seperate column
column1 column2
data1 data1
data2 data2
data3 data3
data3 data4
#column 1 is from excel file 1 and column2 from excel file 2
How to do this for docx as i did for excel ?
# Question fragment: collect Document objects the same way the .xlsx branch
# collects DataFrames (indentation restored so the `if` body is valid).
if t.endswith('.docx'):
    # for c,z in enumerate(t):
    v_doc.append(Document(t))  # <-----how to put this in df and concat according to
    # docx file as i have done with excel ?
docx contains:
#docx contains dummy text's !!!
#docx1 contains:
data1
data2
data3
data4
#docx2 contains:
data5
data6
data7
data8
i want to save the content of docx files to columns of excel. docx 1 content to column 1 of excel and docx 2 to column 2 of same excel.
Hope i get some response. Thank you in advance.
Solution #1: Aggregating multiple .docx documents to single output docx document.
If want to copy the text and style from a collection of docx documents to a single output docx then can use python-docx module.
from docx import Document
import os

master = Document()
# Append every .docx in the current directory to one master document,
# copying each run's text and basic character formatting.
for f in os.listdir('.'):
    if f.endswith('.docx'):
        doc = Document(f)
        for p in doc.paragraphs:
            out_para = master.add_paragraph()
            for run in p.runs:
                output_run = out_para.add_run(run.text)
                # copy style from old to new
                output_run.bold = run.bold
                output_run.italic = run.italic
                output_run.underline = run.underline
                output_run.font.color.rgb = run.font.color.rgb
                # NOTE(review): assigning style.name renames the style object
                # itself in python-docx; `output_run.style = run.style.name`
                # may be what was intended — confirm.
                output_run.style.name = run.style.name
master.save('out.docx')
Solution #2: Aggregating table content from multiple .docx documents to single output excel document.
In your comments, you want to create an excel sheet from a set of word documents with tables of text.
Here is Python code to copy cells in tables of Word documents to a target Excel document.
import pandas as pd
from docx import Document
import os

# Copy every table row from every .docx under ./data into one DataFrame.
rows = []
for f in os.listdir('data'):
    if f.endswith('.docx'):
        # Bug fix: the original called Document(file) — `file` is undefined
        # in Python 3 — and never joined the directory, so the path could
        # not resolve.
        doc = Document(os.path.join('data', f))
        for table in doc.tables:
            for row in table.rows:
                rows.append([cell.text for cell in row.cells])

# DataFrame.append was removed in pandas 2.0: build the frame once from the
# collected rows, with 1-based column labels as before.
df = pd.DataFrame(rows)
df.columns = list(range(1, df.shape[1] + 1))
df.to_excel("output.xlsx")
Solution #3: Aggregating custom table content from multiple .docx documents to single output excel document with a 2-column table.
In your particular sample data, the table is structured with either 3 or 9 columns so need to concatenate the text of other columns to a single value if want to keep 2 columns in output.
# Collect [label, concatenated text] pairs from every table row, then build
# the DataFrame once (DataFrame.append was removed in pandas 2.0).
rows = []
for f in os.listdir('data'):
    if f.endswith('.docx'):
        # Bug fix: the original called Document(file) — `file` is undefined
        # in Python 3 — and never joined the directory with the filename.
        doc = Document(os.path.join('data', f))
        # iterate over all the tables
        for table in doc.tables:
            for row in table.rows:
                cells = row.cells
                if len(cells) > 1:
                    col1 = cells[0].text
                    # check if first column is not empty
                    if col1:
                        # concatenate text of the remaining cells to a single
                        # value, space-separated (a separator is only inserted
                        # once some text has accumulated, as in the original)
                        text = ''
                        for i in range(1, len(cells)):
                            if len(text) != 0:
                                text += ' '
                            text += cells[i].text
                        rows.append([col1, text])
df = pd.DataFrame(rows, columns=['column1', 'column2'])
# save output
df.to_excel("output.xlsx")
You can use docxcompose to concatenate docx files in Python. You can read more in the description on docxcompose's official PyPI page.

Python: How to Webscrape All Rows from a Specific Table

For practice, I am trying to webscrape financial data from one table in this url: https://www.macrotrends.net/stocks/charts/TSLA/tesla/revenue
I'd like to save the data from the "Tesla Quarterly Revenue" table into a data frame and return two columns: Data, Revenue.
Currently the code as it runs now is grabbing data from the adjacent table, "Tesla Annual Revenue." Since the tables don't seem to have unique id's from which to separate them in this instance, how would I select elements only from the "Tesla Quarterly Revenue" table?
Any help or insight on how to remedy this would be deeply appreciated.
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://www.macrotrends.net/stocks/charts/TSLA/tesla/revenue"
html_data = requests.get(url).text
soup = BeautifulSoup(html_data, 'html5lib')

# Collect the rows first: DataFrame.append was removed in pandas 2.0.
records = []
# NOTE(review): soup.find("tbody") returns the FIRST tbody on the page —
# the annual-revenue table. To get the quarterly table, select its tbody
# specifically (e.g. soup.find_all("tbody")[1]) — this is the bug the
# question describes.
for row in soup.find("tbody").find_all("tr"):
    col = row.find_all("td")
    records.append({"Date": col[0].text, "Revenue": col[1].text})

tesla_revenue = pd.DataFrame(records, columns=["Date", "Revenue"])
tesla_revenue.head()
Below are the results when I run this code:
You can let pandas do all the work
import pandas as pd

url = "https://www.macrotrends.net/stocks/charts/TSLA/tesla/revenue"
# read_html parses every <table> on the page into its own DataFrame.
tables = pd.read_html(url)

# Quarterly revenue is the second table. (The original also looped over
# `tables` with a pass body, which only rebound `df` pointlessly — removed.)
df = tables[1]
df.columns = ['Date', 'Revenue']  # rename the columns if you want to
print(df)

Extracting tables from a word doc

Is there any tool to extract all tables from a word documents and converting them to a csv file or any excel extension file using python or vba
note that the word file contains both text and tables.
You can use pandas with python-docx. Per this answer you can extract all tables from a document and put them in a list:
from docx import Document
import pandas as pd

document = Document('test.docx')

tables = []
# One DataFrame per Word table, sized exactly to the table's grid.
for table in document.tables:
    grid = [['' for _ in range(len(table.columns))] for _ in range(len(table.rows))]
    for i, row in enumerate(table.rows):
        for j, cell in enumerate(row.cells):
            if cell.text:
                grid[i][j] = cell.text
    tables.append(pd.DataFrame(grid))
You can then save the tables to csv files by looping through the list:
# Write each extracted table to its own CSV file, numbered in order.
for nr, frame in enumerate(tables):
    frame.to_csv("table_" + str(nr) + ".csv")

Scraping a table with row labels in Python using Beautiful Soup

I'm trying to scrape a table from a website that has row labels. I'm able to get the actual data from the table, but I have no idea how to get the row labels as well.
Here is my code right now:
import numpy as np
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup

url = "http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/dt-td/Rp-eng.cfm?TABID=2&LANG=E&A=R&APATH=3&DETAIL=0&DIM=0&FL=A&FREE=0&GC=01&GL=-1&GID=1341679&GK=1&GRP=1&O=D&PID=110719&PRID=10&PTYPE=109445&S=0&SHOWALL=0&SUB=0&Temporal=2017&THEME=125&VID=0&VNAMEE=&VNAMEF=&D1=0&D2=0&D3=0&D4=0&D5=0&D6=0"
res = urllib.request.urlopen(url)
html = res.read()

## parse with BeautifulSoup
bs = BeautifulSoup(html, "html.parser")
tables = bs.find_all("table")
table = tables[0]

rows = table.find_all("tr")

# extract the first column name (Employment income groups (18))
column_names = []
for cell in rows[0].find_all("th"):
    column_names.append(cell.text.strip().replace("\n", " "))

# extract the rest of the column names
for cell in rows[1].find_all("th"):
    column_names.append(cell.text.strip().replace("\n", " "))

# this is an extra label
column_names.remove('Main mode of commuting (10)')

# get the data from the table
data = []
for row in rows[2:]:
    ## create an empty tuple
    dt = ()
    # Fix for the question's error: the row label lives in a leading <th>
    # cell, not a <td>, so it was never extracted and each tuple had only
    # 10 values for 11 column names. Prepend it here.
    label_cell = row.find("th")
    if label_cell is not None:
        dt = dt + (label_cell.text.strip().replace("\n", " "),)
    for cell in row.find_all("td"):
        ## dp stands for "data point"; prefer the inner <font> text when present
        font = cell.find("font")
        dp = font.text if font is not None else cell.text
        dp = dp.strip().replace("\n", " ")
        ## add to tuple
        dt = dt + (dp,)
    data.append(dt)

df = pd.DataFrame(data, columns=column_names)
Creating the dataframe will give an error because the code above only extracts the cells with data points but does not extract the first cell of each row that contains the row label.
That is, there are 11 column names, but each tuple has only 10 values: the row label (i.e., Total - Employment income) is never extracted, because those cells are of "th" type rather than "td".
How can I get the row label and put it into the tuple as I process the rest of the data in the table?
Thank you for your help.
(The table I am trying to scrape is on this site if it's not clear from the code)
Use this table.findAll('th',{'headers':'col-0'}) to find row labels
lab = []
# Row labels live in <th> cells tagged with headers="col-0".
labels = table.findAll('th', {'headers': 'col-0'})
for label in labels:
    data = str(label.text).strip()
    # Drop the "($)Footnote ..." suffix that some labels carry.
    data = str(data).split("($)Footnote", 1)[0]
    lab.append(data)
    # print(data)
EDIT:
Using pandas.read_html
import numpy as np
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup

url = "http://www12.statcan.gc.ca/census-recensement/2016/dp-pd/dt-td/Rp-eng.cfm?TABID=2&LANG=E&A=R&APATH=3&DETAIL=0&DIM=0&FL=A&FREE=0&GC=01&GL=-1&GID=1341679&GK=1&GRP=1&O=D&PID=110719&PRID=10&PTYPE=109445&S=0&SHOWALL=0&SUB=0&Temporal=2017&THEME=125&VID=0&VNAMEE=&VNAMEF=&D1=0&D2=0&D3=0&D4=0&D5=0&D6=0"
res = urllib.request.urlopen(url)
html = res.read()

## parse with BeautifulSoup
bs = BeautifulSoup(html, "html.parser")
tables = bs.find_all("table")
# Let pandas parse the serialized tables; the first one is the target.
df = (pd.read_html(str(tables)))[0]
# print(df)

# Replace the parsed headers with clean, readable column names.
columns = ['Employment income groups (18)', 'Total - Main mode of commuting',
           'Car, truck or van', 'Driver, alone',
           '2 or more persons shared the ride to work',
           'Driver, with 1 or more passengers',
           'Passenger, 2 or more persons in the vehicle',
           'Sustainable transportation',
           'Public transit', 'Active transport', 'Other method']
df.columns = columns
Edit 2: Element wont be accesible by index because strings are not proper strings (Employment income groups (18) column labels). I have the edited the code again.

Categories

Resources