Python: merging two DataFrames built from scraping

I need help joining two data frames built from scraping two tables on the same tender page.
A sample URL (one of many) is http://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx?idlicitacion=4593-2-L122
import requests
import pandas as pd
from bs4 import BeautifulSoup

results = []
results2 = []

# Set end at 121 so it will use 120; if you set end at 120 then it will finish at 80.
# To test, eliminate the loop and make url equal to a single link.
for i in popup_linkz:
    url = i
    soup = BeautifulSoup(requests.get(i).content, "html.parser")

    licitation_number = soup.select_one("#lblNumLicitacion").text
    responsable = soup.select_one("#lblResponsable").text
    ficha = soup.select_one("#lblFicha2Reclamo").text
    nombre_licitacion = soup.select_one("#lblNombreLicitacion").text
    #print(f"{licitation_number=}")
    #print(f"{responsable=}")
    #print(f"{ficha=}")
    #print(f"{nombre_licitacion=}")
    #print(f"#lblFicha1Tipo")
    #print("-" * 80)

    # one row per product in the products table
    for t in soup.select("#grvProducto .borde_tabla00"):
        categoria = t.select_one('[id$="lblCategoria"]').text
        candidad = t.select_one('[id$="lblCantidad"]').text
        descripction = t.select_one('[id$="lblDescripcion"]').text
        #print(f"{categoria=} {candidad=}")
        results.append((licitation_number, responsable, ficha, nombre_licitacion,
                        categoria, candidad, descripction))
    #print()

    # one row per Ficha1 block
    for z in soup.select("#Ficha1 .tabla_ficha_00"):
        monto = z.select_one('[id$="lblFicha1Tipo"]').text
        estado = z.select_one('[id$="lblFicha1TituloEstado"]').text
        #comuna = z.select_one('[id$="lblFicha2TituloComuna"]').text
        results2.append((monto, estado))
        print('results')
        print(f"{monto=}")

df1 = results
df2 = results2
df3 = pd.merge(results, results2)  # this is the line that raises the TypeError below

df = pd.DataFrame(data=results[1:], columns=results[0])
df.to_excel('licitaciones1.xlsx', index=False, header=False)  # writing to Excel file
I am getting this error:
TypeError: Can only merge Series or DataFrame objects, a <class 'list'> was passed
I'm not sure why. I've been trying to solve it but without much luck so far, so if you can help me I would be really glad.
results looks like this:
results2 looks like this:
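For reference, the TypeError comes from passing plain Python lists to pd.merge, which only accepts DataFrame or Series objects. A minimal sketch of converting both lists first and joining them by row position (the column names are assumptions, not taken from the site, and this only lines up if both lists end up with the same number of rows in the same order):

import pandas as pd

# Hypothetical column names, for illustration only.
df1 = pd.DataFrame(results, columns=["licitacion", "responsable", "ficha", "nombre",
                                     "categoria", "cantidad", "descripcion"])
df2 = pd.DataFrame(results2, columns=["monto", "estado"])

# With no shared key column, a positional join can be done with concat along the columns.
df3 = pd.concat([df1.reset_index(drop=True), df2.reset_index(drop=True)], axis=1)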

I just had to extract the unique value earlier, in the first part. Sorry for the question; I will not delete it since it may be helpful for someone.
url = i
soup = BeautifulSoup(requests.get(i).content, "html.parser")

licitation_number = soup.select_one("#lblNumLicitacion").text
responsable = soup.select_one("#lblResponsable").text
ficha = soup.select_one("#lblFicha2Reclamo").text
nombre_licitacion = soup.select_one("#lblNombreLicitacion").text
monto = soup.select_one("#lblFicha1Tipo").text  # here is the answer
#print(f"{licitation_number=}")
#print(f"{responsable=}")
#print(f"{ficha=}")
#print(f"{nombre_licitacion=}")
#print(f"#lblFicha1Tipo")
#print("-" * 80)

Related

Gathering Values between XML tags in Python

Here's my code. I don't know why I'm getting an empty dataframe. I have tried using BeautifulSoup too, but that did not work well either. I just need the value between the tags.
import xml.etree.ElementTree as ET
import pandas as pd

# column order for the output frame
cols = ["Id", "FullNm", "ClssfctnTp", "CmmdtyDerivInd", "NtnlCcy", "Issr"]
rows = []

root = ET.parse("DLTINS_20210117_01of01.xml").getroot()

for i in root.findall("FinInstrmRptgRefDataDltaRpt"):
    for j in i.findall("FinInstrmGnlAttrbts"):
        Id = j.find("Id").text
        FullNm = j.find("FullNm").text
        ClssfctnTp = j.find("ClssfctnTp").text
        CmmdtyDerivInd = j.find("CmmdtyDerivInd").text
        NtnlCcy = j.find("NtnlCcy").text
    Issr = i.find("Issr").text
    rows.append({
        "Id": Id,
        "FullNm": FullNm,
        "ClssfctnTp": ClssfctnTp,
        "CmmdtyDerivInd": CmmdtyDerivInd,
        "NtnlCcy": NtnlCcy,
        "Issr": Issr,
    })

df = pd.DataFrame(rows, columns=cols)
Here's the XML file
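Not an answer from the thread, but a common reason findall returns nothing with ElementTree is that the XML declares a default namespace, so the plain tag names never match. A minimal sketch that sidesteps the namespace by comparing local tag names (the file name and tag names follow the question; the local() helper is added for illustration):

import xml.etree.ElementTree as ET
import pandas as pd

def local(tag):
    # Strip a '{namespace}' prefix if present, e.g. '{urn:...}Id' -> 'Id'.
    return tag.split('}', 1)[-1]

root = ET.parse("DLTINS_20210117_01of01.xml").getroot()

rows = []
for elem in root.iter():
    if local(elem.tag) == "FinInstrmGnlAttrbts":
        # Collect each child's text keyed by its local tag name.
        rows.append({local(child.tag): child.text for child in elem})

df = pd.DataFrame(rows)
print(df.head())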

How to resolve - ValueError: cannot set using a multi-index selection indexer with a different length than the value in Python

I have some sample code that I use to analyze entities and their sentiments using Google's Natural Language API. For every record in my Pandas dataframe, I want to return a list of dictionaries where each element is an entity. However, I am running into issues when trying to get it to work on the production data. Here is the sample code:
from google.cloud import language_v1  # version 2.0.0
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'path/to/json'
import pandas as pd

# establish client connection
client = language_v1.LanguageServiceClient()

# helper function
def custom_analyze_entity(text_content):
    global client
    #print("Accepted Input::" + text_content)
    document = language_v1.Document(content=text_content, type_=language_v1.Document.Type.PLAIN_TEXT, language='en')
    response = client.analyze_entity_sentiment(request={'document': document})
    # a document can have many entities
    # create a list of dictionaries; every element in the list is a dictionary that represents an entity
    # the dictionary is nested
    l = []
    #print("Entity response:" + str(response.entities))
    for entity in response.entities:
        #print('=' * 20)
        temp_dict = {}
        temp_meta_dict = {}
        temp_mentions = {}
        temp_dict['name'] = entity.name
        temp_dict['type'] = language_v1.Entity.Type(entity.type_).name
        temp_dict['salience'] = str(entity.salience)
        sentiment = entity.sentiment
        temp_dict['sentiment_score'] = str(sentiment.score)
        temp_dict['sentiment_magnitude'] = str(sentiment.magnitude)
        for metadata_name, metadata_value in entity.metadata.items():
            temp_meta_dict['metadata_name'] = metadata_name
            temp_meta_dict['metadata_value'] = metadata_value
        temp_dict['metadata'] = temp_meta_dict
        for mention in entity.mentions:
            temp_mentions['mention_text'] = str(mention.text.content)
            temp_mentions['mention_type'] = str(language_v1.EntityMention.Type(mention.type_).name)
        temp_dict['mentions'] = temp_mentions
        #print(u"Appended Entity::: {}".format(temp_dict))
        l.append(temp_dict)
    return l
I have tested it on sample data and it works fine
# works on sample data
data = ['Grapes are good. Bananas are bad.',
        'the weather is not good today',
        'Michelangelo Caravaggio, Italian painter, is known for many arts',
        'look i cannot articulate how i feel today but its amazing to be back on the field with runs under my belt.']
input_df = pd.DataFrame(data=data, columns=['freeform_text'])

for i in range(len(input_df)):
    op = custom_analyze_entity(input_df.loc[i, 'freeform_text'])
    input_df.loc[i, 'entity_object'] = op
But when I try to run it over the production data using the code below, it fails with the multi-index error. I am not able to reproduce the error using the sample pandas dataframe.
for i in range(len(input_df)):
    op = custom_analyze_entity(input_df.loc[i, 'freeform_text'])
    input_df.loc[i, 'entity_object'] = op
...
Traceback (most recent call last):
  File "<stdin>", line 3, in <module>
  File "/opt/conda/default/lib/python3.6/site-packages/pandas/core/indexing.py", line 670, in __setitem__
    iloc._setitem_with_indexer(indexer, value)
  File "/opt/conda/default/lib/python3.6/site-packages/pandas/core/indexing.py", line 1667, in _setitem_with_indexer
    "cannot set using a multi-index "
ValueError: cannot set using a multi-index selection indexer with a different length than the value
Try doing this:
input_df.loc[0, 'entity_object'] = ""
for i in range(len(input_df)):
    op = custom_analyze_entity(input_df.loc[i, 'freeform_text'])
    input_df.loc[i, 'entity_object'] = op
Or for your specific case, you don't need to use the loc function.
input_df["entity_object"] = ""
for i in range(len(input_df)):
op = custom_analyze_entity(input_df.loc[i,'freeform_text'])
input_df["entity_object"][i] = op

Writing scraped headers from webpages to pandas frame

I wrote this code to download the h1, h2 and h3 headers and write them to a pandas frame along with a list of URLs, but it gives an unpacking error: expected 3 values.
def url_corrector(url):
    if not str(url).startswith('http'):
        return "https://" + str(url)
    else:
        return str(url)

def header_agg(url):
    h1_list = []
    h2_list = []
    h3_list = []
    p = requests.get(url_corrector(url), proxies=proxy_data, verify=False)
    soup = BeautifulSoup(p.text, 'lxml')
    for tag in soup.find_all('h1'):
        h1_list.append(tag.text)
    for tag in soup.find_all('h2'):
        h2_list.append(tag.text)
    for tag in soup.find_all('h3'):
        h3_list.append(tag.text)
    return h1_list, h2_list, h3_list

headers_frame = url_list.copy()
headers_frame['H1'], headers_frame['H2'], headers_frame['H3'] = headers_frame.url.map(lambda x: header_agg(x))
Any help on how to do it?
Getting this error:
ValueError: too many values to unpack (expected 3)
Let's assume that url_list is a dict with the following structure:
url_list = {'url': [<url1>, <url2>, <url3>, <url4>, ..., <urln>]}
The call to headers_frame.url.map(lambda x: header_agg(x)) will return a list with n elements of the form:
[<url1(h1_list, h2_list, h3_list)>, <url2(h1_list, h2_list, h3_list)>, ..., <urln(h1_list, h2_list, h3_list)>]
For the code to produce the output you require, you may have to rewrite the last statement as a loop:
headers_frame.update({'H1': [], 'H2': [], 'H3': []})
for url in headers_frame.url:
    headers = header_agg(url)
    headers_frame['H1'].extend(headers[0])
    headers_frame['H2'].extend(headers[1])
    headers_frame['H3'].extend(headers[2])
You have to return a single object. Just change the return statement to:
return [h1_list, h2_list, h3_list]
I did this to work around the issue. However, I'm still unsure why the original isn't working.
headers_frame = url_list.copy()
H1 = []
H2 = []
H3 = []
for url in headers_frame.url:
    k = header_agg(url)
    H1.append(k[0])
    H2.append(k[1])
    H3.append(k[2])
pd.DataFrame(np.column_stack([headers_frame.url, H1, H2, H3]))
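As to why the original one-liner fails: headers_frame.url.map(header_agg) returns a Series with one (h1_list, h2_list, h3_list) tuple per URL, so unpacking it into three targets only works when there happen to be exactly three URLs. Transposing the tuples with zip makes the three-way assignment work (a sketch, assuming url_list is a DataFrame with a url column):

headers_frame = url_list.copy()
# zip(*...) regroups the per-URL (h1, h2, h3) tuples into three row-aligned tuples.
h1, h2, h3 = zip(*headers_frame.url.map(header_agg))
headers_frame["H1"], headers_frame["H2"], headers_frame["H3"] = h1, h2, h3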

Indexing issue that doesn't make sense when trying to scrape using BeautifulSoup

I'm trying to use the script below to go through a list of URLs and, for each URL, find the date and the location of each race. I am getting an IndexError for index out of range, but I know that the lists I'm iterating over are all the same length, so these errors don't make sense. Also, when running it through PyCharm I get different points at which the IndexErrors occur compared to running it through the terminal. I wasn't going to post here, but I'm seriously confused and wondering if anyone else can replicate what I'm seeing and has an explanation of what I'm missing. Here's the code and the list:
import urllib.request
from bs4 import BeautifulSoup

with open('hk_pages.txt', 'r') as urls:
    starting_list = urls.read().split()

for url in starting_list:
    html = urllib.request.urlopen(url)
    soup = BeautifulSoup(html, "html.parser")

    # Track
    tracksoup = str(soup.findAll("td", {"class": "racingTitle"}))
    tracklist = tracksoup.split('>')
    track = tracklist[1][:2]

    # Date
    datesoup = str(soup.findAll("td", {"class": "tdAlignL number13 color_black"}))
    datelist = datesoup.split()
    date = datelist[6]

    print(date)
    print(track)
    print("**************************************************")
Here's the list of urls:
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20150906
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20150909
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20150913
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20150916
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20150919
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20150923
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20150928
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151001
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151004
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151007
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151010
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151014
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151017
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151018
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151022
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151025
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151031
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151101
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151103
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151107
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151108
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151111
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151114
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151118
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151121
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151125
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151129
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151202
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151206
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151209
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151213
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151216
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151219
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151223
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151227
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160101
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160106
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160109
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160113
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160117
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160120
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160124
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160131
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160203
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160206
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160210
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160214
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160217
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160221
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160224
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160227
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160228
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160302
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160305
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160306
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160309
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160313
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160316
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160319
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160320
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160323
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160326
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160328
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160331
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160402
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160403
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160406
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160409
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160410
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160413
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160416
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160417
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160420
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160424
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160427
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160501
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160504
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160507
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160511
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160514
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160518
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160522
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160529
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160601
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160604
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160605
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160609
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160612
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160614
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160615
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160616
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160618
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160619
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160622
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160626
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160701
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160706
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160710
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160903
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160907
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160911
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160918
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160921
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160925
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160928
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161001
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161002
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161005
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161008
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161012
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161015
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161016
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161019
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161022
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161023
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161026
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161029
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161030
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161101
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161102
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161105
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161106
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161109
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161112
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161116
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161118
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161120
# Track
tracksoup = str(soup.findAll("td", {"class": "racingTitle"}))
tracklist = tracksoup.split('>')
track = tracklist[1][:2]
The problem is that you cannot usefully do str([item, item, ...]): soup.findAll returns a list, and if you stringify it the output will be:
'[item, item, ...]'
which is not what you want. Splitting that string and indexing into the pieces goes out of range whenever a page has a different number of matches or slightly different markup, which is why the IndexError appears at seemingly random points.
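In other words, work with the tag objects directly instead of stringifying the result list. A minimal sketch of the same extraction using the first matching cell (the selectors and the [:2] slice are taken from the question; the date cell may still need splitting depending on its contents):

# Track: read the text of the first matching cell directly.
track_td = soup.find("td", {"class": "racingTitle"})
track = track_td.get_text(strip=True)[:2] if track_td else None

# Date: same idea; split further if the cell holds more than the date.
date_td = soup.find("td", {"class": "tdAlignL number13 color_black"})
date = date_td.get_text(strip=True) if date_td else None

print(date, track)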

How to parse a single-column text file into a table using python?

I'm new here to StackOverflow, but I have found a LOT of answers on this site. I'm also a programming newbie, so I figured I'd join and finally become part of this community, starting with a question about a problem that's been plaguing me for hours.
I log in to a website and scrape a big body of text within the b tag that needs to be converted into a proper table. The layout of the resulting Output.txt looks like this:
BIN STATUS
8FHA9D8H 82HG9F RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
INVENTORY CODE: FPBC *SOUP CANS LENTILS
BIN STATUS
HA8DHW2H HD0138 RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8SHDNADU 00A123 #2956- INVALID STOCK COUPON CODE (MISSING).
93827548 096DBR RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
There are a bunch of pages with the exact same blocks, but I need them to be combined into an ACTUAL table that looks like this:
BIN INV CODE STATUS
HA8DHW2HHD0138 FPBC-*SOUP CANS LENTILS RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8SHDNADU00A123 FPBC-*SOUP CANS LENTILS #2956- INVALID STOCK COUPON CODE (MISSING).
93827548096DBR FPBC-*SOUP CANS LENTILS RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
8FHA9D8H82HG9F SSXR-98-20LM NM CORN CREAM RECEIVED SUCCESSFULLY AWAITING STOCKING PROCESS
Essentially, all the separate text blocks in this example would become part of this table, with the inventory code repeated alongside its BIN values. I would post my attempts at parsing this data (I have tried pandas/bs/openpyxl/csv writer), but I'll admit they are a little embarrassing, as I cannot find any information on this specific problem. Is there any benevolent soul out there who can help me out? :)
(Also, I am using Python 2.7.)
A simple custom parser like the following should do the trick.
from __future__ import print_function

def parse_body(s):
    line_sep = '\n'
    getting_bins = False
    inv_code = ''
    for l in s.split(line_sep):
        if l.startswith('INVENTORY CODE:') and not getting_bins:
            inv_data = l.split()
            inv_code = inv_data[2] + '-' + ' '.join(inv_data[3:])
        elif l.startswith('INVENTORY CODE:') and getting_bins:
            print("unexpected inventory code while reading bins:", l)
        elif l.startswith('BIN') and l.endswith('MESSAGE'):
            getting_bins = True
        elif getting_bins == True and l:
            bin_data = l.split()
            # need to add exception handling here to make sure:
            # 1) we have an inv_code
            # 2) bin_data is at least 3 items big (assuming two for
            #    bin_id and at least one for message)
            # 3) maybe some constraint checking to ensure that we have
            #    a valid instance of an inventory code and bin id
            bin_id = ''.join(bin_data[0:2])
            message = ' '.join(bin_data[2:])
            # we now have a bin, an inv_code, and a message to add to our table
            print(bin_id.ljust(20), inv_code.ljust(30), message, sep='\t')
        elif getting_bins == True and not l:
            # done getting bins for current inventory code
            getting_bins = False
            inv_code = ''
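A minimal usage sketch, assuming the scraped text has been saved to Output.txt as in the question (note that the sample dump shows a 'BIN STATUS' header, so the endswith('MESSAGE') check above would need to match whatever header text the real pages use):

# Feed the scraped dump to the parser; rows are printed as tab-separated columns.
with open("Output.txt") as f:
    parse_body(f.read())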
A rather complex one, but this might get you started:
import re, pandas as pd
from pandas import DataFrame

rx = re.compile(r'''
    (?:INVENTORY\ CODE:)\s*
    (?P<inv>.+\S)
    [\s\S]+?
    ^BIN.+[\n\r]
    (?P<bin_msg>(?:(?!^\ ).+[\n\r])+)
    ''', re.MULTILINE | re.VERBOSE)

string = your_string_here

# set up the dataframe
df = DataFrame(columns=['BIN', 'INV', 'MESSAGE'])

for match in rx.finditer(string):
    inv = match.group('inv')
    bin_msg_raw = match.group('bin_msg').split("\n")
    rxbinmsg = re.compile(r'^(?P<bin>(?:(?!\ {2}).)+)\s+(?P<message>.+\S)\s*$', re.MULTILINE)
    for item in bin_msg_raw:
        for m in rxbinmsg.finditer(item):
            # append it to the dataframe
            df.loc[len(df.index)] = [m.group('bin'), inv, m.group('message')]

print(df)
Explanation
It looks for INVENTORY CODE and sets up the groups (inv and bin_msg) for further processing afterwards (note: it would be easier if you had only one line of bin/msg, as the group has to be split afterwards).
It then splits the bin and msg parts and appends everything to the df object.
I had code written for scraping a website which may help you.
Basically, what you need to do is right-click on the web page, go to the HTML, find the tag for the table you are looking for, and extract the information using a module (I am using Beautiful Soup). I am creating JSON as I need to store it in MongoDB; you can create a table instead.
#! /usr/bin/python
import sys
import requests
import re
from BeautifulSoup import BeautifulSoup
import pymongo

def req_and_parsing():
    url2 = 'http://businfo.dimts.in/businfo/Bus_info/EtaByRoute.aspx?ID='
    list1 = ['534UP', '534DOWN']
    for Route in list1:
        final_url = url2 + Route
        #r = requests.get(final_url)
        #parsing_file(r.text, Route)
    outdict = []
    outdict = [parsing_file(requests.get(url2 + Route).text, Route) for Route in list1]
    print outdict
    conn = f_connection()
    for i in range(len(outdict)):
        insert_records(conn, outdict[i])

def parsing_file(txt, Route):
    soup = BeautifulSoup(txt)
    table = soup.findAll("table", {"id": "ctl00_ContentPlaceHolder1_GridView2"})
    #trtags = table[0].findAll('tr')
    tdlist = []
    trtddict = {}
    """
    for trtag in trtags:
        print 'print trtag- ', trtag.text
        tdtags = trtag.findAll('td')
        for tdtag in tdtags:
            print tdtag.text
    """
    divtags = soup.findAll("span", {"id": "ctl00_ContentPlaceHolder1_ErrorLabel"})
    for divtag in divtags:
        print "div tag - ", divtag.text
        if divtag.text in ("Currently no bus is running on this route",
                           "This is not a cluster (orange bus) route"):
            print "Page not displayed. Errored with below message for Route-", Route, " , ", divtag.text
            sys.exit()
    trtags = table[0].findAll('tr')
    for trtag in trtags:
        tdtags = trtag.findAll('td')
        if len(tdtags) == 2:
            trtddict[tdtags[0].text] = sub_colon(tdtags[1].text)
    return trtddict

def sub_colon(tag_str):
    return re.sub(';', ',', tag_str)

def f_connection():
    try:
        conn = pymongo.MongoClient()
        print "Connected successfully!!!"
    except pymongo.errors.ConnectionFailure, e:
        print "Could not connect to MongoDB: %s" % e
    return conn

def insert_records(conn, stop_dict):
    db = conn.test
    print db.collection_names()
    mycoll = db.stopsETA
    mycoll.insert(stop_dict)

if __name__ == "__main__":
    req_and_parsing()
