Writing scraped headers from webpages to pandas frame - python

I wrote this code to download the h1, h2 and h3 headers from a list of URLs and write them to a pandas DataFrame along with the URLs, but it raises an unpacking error: expected 3 values.
import requests
from bs4 import BeautifulSoup

def url_corrector(url):
    # Prepend a scheme if the URL does not already have one
    if not str(url).startswith('http'):
        return "https://" + str(url)
    else:
        return str(url)

def header_agg(url):
    # Collect the text of every h1, h2 and h3 tag on the page
    h1_list = []
    h2_list = []
    h3_list = []
    p = requests.get(url_corrector(url), proxies=proxy_data, verify=False)
    soup = BeautifulSoup(p.text, 'lxml')
    for tag in soup.find_all('h1'):
        h1_list.append(tag.text)
    for tag in soup.find_all('h2'):
        h2_list.append(tag.text)
    for tag in soup.find_all('h3'):
        h3_list.append(tag.text)
    return h1_list, h2_list, h3_list

headers_frame = url_list.copy()
headers_frame['H1'], headers_frame['H2'], headers_frame['H3'] = headers_frame.url.map(lambda x: header_agg(x))
Any help on how to do it?
Getting this error:
ValueError: too many values to unpack (expected 3)

Let's assume that url_list is a dict with the following structure:
url_list = {'url': [<url1>, <url2>, <url3>, <url4>, ..., <urln>]}
The call to headers_frame.url.map(lambda x: header_agg(x)) will return a list with n elements of the form:
[<url1(h1_list, h2_list, h3_list)>, <url2(h1_list, h2_list, h3_list)>, ..., <urln(h1_list, h2_list, h3_list)>]
For the code to produce the output you require, you may have to re-write the last statement as a loop:
headers_frame.update({'H1': [], 'H2': [], 'H3': []})
for url in headers_frame.url:
    headers = header_agg(url)
    headers_frame['H1'].extend(headers[0])
    headers_frame['H2'].extend(headers[1])
    headers_frame['H3'].extend(headers[2])

You have to return one entity. Just change:
return [h1_list, h2_list, h3_list]
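Even with a single returned list, assigning to three columns in one statement can still hit the same unpacking problem, since the map produces one object per URL. A minimal sketch of one way to expand the per-URL results into three columns (assuming url_list is a DataFrame with a url column; this is an illustration, not the asker's code):
# Sketch: expand the (h1, h2, h3) results into three columns.
results = headers_frame.url.map(header_agg)   # one tuple/list per URL
headers_frame['H1'] = results.map(lambda t: t[0])
headers_frame['H2'] = results.map(lambda t: t[1])
headers_frame['H3'] = results.map(lambda t: t[2])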

I did this to work around the issue. However, I'm still unsure why the original isn't working.
import numpy as np
import pandas as pd

headers_frame = url_list.copy()
H1 = []
H2 = []
H3 = []
for url in headers_frame.url:
    k = header_agg(url)
    H1.append(k[0])
    H2.append(k[1])
    H3.append(k[2])
pd.DataFrame(np.column_stack([headers_frame.url, H1, H2, H3]))
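As to why the original fails: headers_frame.url.map(header_agg) yields a Series with one (h1_list, h2_list, h3_list) entry per URL, and the triple assignment tries to unpack that whole Series into three names, so with more than three URLs Python raises the error. A tiny illustration with hypothetical data (not the original frame):
import pandas as pd

# Five rows, each holding a 3-tuple, standing in for the map() result.
s = pd.Series([(['h1'], ['h2'], ['h3'])] * 5)
# a, b, c = s  ->  ValueError: too many values to unpack (expected 3),
# because unpacking iterates over the five rows, not over one tuple.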

Related

Gathering Values between XML tags in Python

Here's my code. I don't know why I'm getting an empty DataFrame. I have tried using BeautifulSoup too, but that did not work well either. I just need the value between the tags.
import xml.etree.ElementTree as ET
import pandas as pd

rows = []
root = ET.parse("DLTINS_20210117_01of01.xml").getroot()
for i in root.findall("FinInstrmRptgRefDataDltaRpt"):
    for j in i.findall("FinInstrmGnlAttrbts"):
        Id = j.find("Id").text
        FullNm = j.find("FullNm").text
        ClssfctnTp = j.find("ClssfctnTp").text
        CmmdtyDerivInd = j.find("CmmdtyDerivInd").text
        NtnlCcy = j.find("NtnlCcy").text
        Issr = i.find("Issr").text
        rows.append({
            "Id": Id,
            "FullNm": FullNm,
            "ClssfctnTp": ClssfctnTp,
            "CmmdtyDerivInd": CmmdtyDerivInd,
            "NtnlCcy": NtnlCcy,
            "Issr": Issr
        })
df = pd.DataFrame(rows, columns=cols)
Here's the XML file

python merge df of scraping

I need your help joining two data frames built from scraping two sections of each page.
A sample URL (one of many) is http://www.mercadopublico.cl/Procurement/Modules/RFB/DetailsAcquisition.aspx?idlicitacion=4593-2-L122
for i in popup_linkz:  # set end at 121 so it will use 120; if you set end at 120 then it will finish on 80 # eliminate and make the url equal to i to test
    url = i
    soup = BeautifulSoup(requests.get(i).content, "html.parser")
    licitation_number = soup.select_one("#lblNumLicitacion").text
    responsable = soup.select_one("#lblResponsable").text
    ficha = soup.select_one("#lblFicha2Reclamo").text
    nombre_licitacion = soup.select_one("#lblNombreLicitacion").text
    #print(f"{licitation_number=}")
    #print(f"{responsable=}")
    #print(f"{ficha=}")
    #print(f"{nombre_licitacion=}")
    #print(f"#lblFicha1Tipo")
    #print("-" * 80)
    for t in soup.select("#grvProducto .borde_tabla00"):
        categoria = t.select_one('[id$="lblCategoria"]').text
        candidad = t.select_one('[id$="lblCantidad"]').text
        descripction = t.select_one('[id$="lblDescripcion"]').text
        #print(f"{categoria=} {candidad=}")
        results.append((licitation_number, responsable, ficha, nombre_licitacion, categoria, candidad, descripction))
    #print()
    for z in soup.select("#Ficha1 .tabla_ficha_00"):
        monto = z.select_one('[id$="lblFicha1Tipo"]').text
        estado = z.select_one('[id$="lblFicha1TituloEstado"]').text
        #comuna = z.select_one('[id$="lblFicha2TituloComuna"]').text
        results2.append((monto, estado))
        print('results')
        print(f"{monto=}")

import pandas as pd
df1 = results
df2 = results2
df3 = pd.merge(results, results2)
df = pd.DataFrame(data=results[1:], columns=results[0])
df.to_excel('licitaciones1.xlsx', index=False, header=False)  # Writing to Excel file
I am getting this error:
TypeError: Can only merge Series or DataFrame objects, a <class 'list'> was passed
I'm not sure why; I'm trying to solve it but without much luck so far, so if you can help me I would be really glad.
results looks like this
results2 looks like this
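For what the error itself says: pd.merge only accepts Series or DataFrame arguments, so the two lists would first have to be wrapped in DataFrames, and they would also need either a shared key column or an index-based join. A rough sketch under those assumptions, with illustrative column names that are not from the original code:
import pandas as pd

# Hypothetical column names, for illustration only.
df1 = pd.DataFrame(results, columns=[
    "licitation_number", "responsable", "ficha", "nombre_licitacion",
    "categoria", "cantidad", "descripcion"])
df2 = pd.DataFrame(results2, columns=["monto", "estado"])

# Without a shared key column, one option is joining on the row index,
# which is only meaningful if the rows of the two lists line up one-to-one.
df3 = df1.join(df2)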
I just had to extract the unique value earlier, in the first part. Sorry for the question; I will not delete it since it may be helpful for someone.
url = i
soup = BeautifulSoup(requests.get(i).content, "html.parser")
licitation_number = soup.select_one("#lblNumLicitacion").text
responsable = soup.select_one("#lblResponsable").text
ficha = soup.select_one("#lblFicha2Reclamo").text
nombre_licitacion = soup.select_one("#lblNombreLicitacion").text
monto = soup.select_one("#lblFicha1Tipo").text  # here is the answer
#print(f"{licitation_number=}")
#print(f"{responsable=}")
#print(f"{ficha=}")
#print(f"{nombre_licitacion=}")
#print(f"#lblFicha1Tipo")
#print("-" * 80)

Indexing issue that doesn't make sense when trying to scrape using BeautifulSoup

I'm trying to use the script below to go through a list of URLs and, for each URL, find the date and the location of the race. I am getting an IndexError for index out of range, but I know that the lists I'm iterating over are all the same length, so these errors don't make sense to me. Also, when running through PyCharm I get IndexErrors at different points than when running through the terminal. I wasn't going to post here, but I'm seriously confused and wondering if anyone else can replicate what I'm seeing and has an explanation of what I'm missing. Here's the code and the list:
import urllib.request
from bs4 import BeautifulSoup

with open('hk_pages.txt', 'r') as urls:
    starting_list = urls.read().split()

for url in starting_list:
    html = urllib.request.urlopen(url)
    soup = BeautifulSoup(html, "html.parser")

    # Track
    tracksoup = str(soup.findAll("td", {"class": "racingTitle"}))
    tracklist = tracksoup.split('>')
    track = tracklist[1][:2]

    # Date
    datesoup = str(soup.findAll("td", {"class": "tdAlignL number13 color_black"}))
    datelist = datesoup.split()
    date = datelist[6]

    print(date)
    print(track)
    print("**************************************************")
Here's the list of urls:
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20150906
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20150909
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20150913
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20150916
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20150919
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20150923
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20150928
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151001
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151004
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151007
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151010
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151014
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151017
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151018
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151022
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151025
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151031
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151101
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151103
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151107
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151108
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151111
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151114
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151118
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151121
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151125
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151129
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151202
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151206
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151209
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151213
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151216
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151219
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151223
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20151227
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160101
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160106
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160109
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160113
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160117
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160120
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160124
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160131
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160203
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160206
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160210
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160214
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160217
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160221
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160224
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160227
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160228
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160302
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160305
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160306
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160309
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160313
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160316
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160319
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160320
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160323
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160326
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160328
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160331
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160402
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160403
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160406
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160409
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160410
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160413
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160416
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160417
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160420
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160424
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160427
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160501
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160504
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160507
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160511
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160514
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160518
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160522
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160529
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160601
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160604
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160605
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160609
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160612
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160614
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160615
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160616
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160618
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160619
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160622
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160626
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160701
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160706
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160710
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160903
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160907
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160911
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160918
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160921
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160925
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20160928
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161001
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161002
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161005
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161008
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161012
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161015
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161016
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161019
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161022
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161023
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161026
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161029
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161030
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161101
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161102
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161105
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161106
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161109
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161112
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161116
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161118
http://racing.hkjc.com/racing/info/meeting/ResultsAll/English/Local/20161120
# Track
tracksoup = str(soup.findAll("td", {"class": "racingTitle"}))
tracklist = tracksoup.split('>')
track = tracklist[1][:2]
The problem is that you cannot rely on str([item, item, ...]): soup.findAll returns a list, and stringifying it gives you
'[item, item, ...]'
which is not what you want.
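A sketch of a more direct way to pull the values out (an assumption about the same page structure, not the answerer's code): take the first matching element and read its text, guarding against pages where the element is missing. Note this reads the whole date cell rather than a fixed token position.
# Hypothetical rewrite of the Track/Date extraction from the question.
track_td = soup.find("td", {"class": "racingTitle"})
track = track_td.get_text(strip=True)[:2] if track_td else None

date_td = soup.find("td", {"class": "tdAlignL number13 color_black"})
date = date_td.get_text(strip=True) if date_td else None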

How can I replace multiple try/except blocks with less code?

I find myself writing code like the below quite a bit. It's very verbose. What I'd like to do is assign list indices to different variables and, if there's an IndexError, assign False. I feel like there should be a shorter syntax for doing this (compared to what I have below).
Edit - here's my actual code. page is a valid lxml.html object. Each of the selectors may or may not return a value, depending on whether that section is present on the page.
def extract_data(page):
    # given lxml.html obj, extract data from g+ page and return as dict
    try:
        profile_name = page.xpath('//div[@guidedhelpid="profile_name"]/text()')[0]
    except IndexError:
        profile_name = False
    try:
        website = page.cssselect('span.K9a')[0].text_content().rstrip('/')
    except IndexError:
        website = False
    try:
        contact_div = html.tostring(page.xpath('//div[normalize-space(text())="Contact Information"]/../../..')[0])
    except IndexError:
        contact_div = False
    return {
        'profile_name': profile_name,
        'website': website,
        'contact_div': contact_div,
    }
Assuming what you're trying to do makes sense within the context of your use case, you can encapsulate this notion of a default value inside a function:
def retrieve(my_list, index, default_value=False):
    try:
        return my_list[index]
    except IndexError:
        return default_value
That way you can do something like:
my_list = [2, 4]
first = retrieve(my_list, 0)
# first will be 2
second = retrieve(my_list, 1)
# second will be 4
third = retrieve(my_list, 2)
# third will be False
You can even change the value you'd like to default to in case the index does not exist.
In general, when you're repeating code like in the manner you're doing above, the first thing you should think about is whether you can write a function that does what you're trying to do.
Using your actual code, you could do something like:
profile_name = retrieve(page.xpath( '//div[#guidedhelpid="profile_name"]/text()'), 0)
website = retrieve(page.cssselect( 'span.K9a' ), 0)
if website:
website = website.text_content().rstrip('/')
contact_div = retrieve(page.xpath( '//div[normalize-space(text())="Contact Information"]/../../..' ), 0)
if contact_div:
contact_div = html.tostring(contact_div)
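As an aside (not part of this answer), the same fallback can be written without a helper, since next() over an iterator accepts a default value:
# Equivalent one-liner: next() returns False when the xpath result list is empty.
profile_name = next(iter(page.xpath('//div[@guidedhelpid="profile_name"]/text()')), False)
That keeps the behaviour identical to retrieve(...) for this particular lookup.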
vars = ['first', 'second', 'third']
r = {}
for i, var in enumerate(vars):
    try:
        r[var] = l[i]
    except IndexError:
        r[var] = False
This should solve your problem :) exec + looping to the rescue!
l = list([0, 2])
numberWords = {0: "first", 1: "second", 2: "third"}
for i in range(len(numberWords)):
    try:
        exec(numberWords[i] + "=l[" + str(i) + "]")
    except IndexError:
        exec(numberWords[i] + "=False")

Organizing Results in Python

Alright, so basically I have a Google script that searches for a keyword. The results look like:
http://www.example.com/user/1234
http://www.youtube.com/user/125
http://www.forum.com/user/12
What could I do to organize these results like this?:
Forums:
http://www.forum.com/user/12
YouTubes:
http://www.youtube.com/user/125
Unidentified:
http://www.example.com/user/1234
By the way I'm organizing them with keywords. If the url has "forum" in it then it goes to the forum list, if it has YouTube it goes to the YouTube list, but if no keywords match up then it goes to unidentified.
1/. Create a dict, and assign an empty list to each keyword you have, e.g.
my_dict = {'forums': [], 'youtube': [], 'unidentified': []}
2/. Iterate over your URLs.
3/. Generate a key for each URL (the domain name, in your case); you can extract it with the re regex module.
4/. Check the dictionary from step 1 for this key: if it does not match a known keyword, file the URL under 'unidentified'; if it does, append the URL to the list stored under that key. A rough sketch of these steps follows.
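A minimal sketch of those four steps (the regex, the dict keys, and the results list are illustrative assumptions, not the answerer's code):
import re

# Step 1: one empty list per keyword, plus a bucket for everything else.
my_dict = {'forums': [], 'youtube': [], 'unidentified': []}

# Step 3: pull the second-level domain out of the URL as the key.
domain_re = re.compile(r'//www\.(\w+)\.')

for url in results:  # Step 2: iterate over the URLs.
    m = domain_re.search(url)
    key = m.group(1) if m else None
    # Step 4: file the URL under its keyword, or under 'unidentified'.
    if key == 'forum':
        my_dict['forums'].append(url)
    elif key == 'youtube':
        my_dict['youtube'].append(url)
    else:
        my_dict['unidentified'].append(url)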
Something like this? I guess you will be able to adapt this example to your needs
import pprint
import re

urls = ['http://www.example.com/user/1234',
        'http://www.youtube.com/user/126',
        'http://www.youtube.com/user/125',
        'http://www.forum.com/useryoutube/12']

pattern = re.compile('//www\.(\w+)\.')
keys = ['forum', 'youtube']
results = dict()
for u in urls:
    ms = pattern.search(u)
    key = ms.group(1)
    if key in keys:
        results.setdefault(key, []).append(u)
pprint.pprint(results)
import urlparse  # Python 2; in Python 3 use urllib.parse

urls = """
http://www.example.com/user/1234
http://www.youtube.com/user/125
http://www.forum.com/user/12
""".split()

categories = {
    "youtube.com": [],
    "forum.com": [],
    "unknown": [],
}

for url in urls:
    netloc = urlparse.urlparse(url).netloc
    if netloc.count(".") == 2:
        # chop sub-domain
        netloc = netloc.split(".", 1)[1]
    if netloc in categories:
        categories[netloc].append(url)
    else:
        categories["unknown"].append(url)

print categories
Parse the URLs. Find the category. Append the full URL.
You should probably keep your sorted results in a dictionary and the unsorted ones in a list. You could then sort it like so:
categorized_results = {"forum": [], "youtube": []}
uncategorized_results = []
for i in results:
    parts = i.split(".")
    j = True  # still uncategorized
    for k in categorized_results:
        if k in parts:
            categorized_results[k].append(i)
            j = False
    if j:
        uncategorized_results.append(i)
If you'd like to output it neatly:
category_aliases = {"forum": "Forums:", "youtube": "Youtubes:"}
for i in categorized_results:
    print(category_aliases[i])
    for j in categorized_results[i]:
        print(j)
    print("\n")
print("Unidentified:")
print("\n".join(uncategorized_results))  # Let's not put in another for loop.
How about this:
from urlparse import urlparse  # Python 2; urllib.parse in Python 3

class Organizing_Results(object):
    CATEGORY = {'example': [], 'youtube': [], 'forum': []}

    def __init__(self):
        self.url_list = []

    def add_single_url(self, url):
        self.url_list.append(urlparse(url))

    def _reduce_result_list(self, acc, element):
        # element is a ParseResult; element[1] is the netloc
        for c in self.CATEGORY:
            if c in element[1]:
                return self.CATEGORY[c].append(element)
        return self.CATEGORY['example'].append(element)

    def get_result(self):
        reduce(lambda x, y: self._reduce_result_list(x, y), self.url_list, [])
        return self.CATEGORY

c = Organizing_Results()
c.add_single_url('http://www.example.com/user/1234')
c.add_single_url('http://www.youtube.com/user/1234')
c.add_single_url('http://www.unidentified.com/user/1234')
c.get_result()
You can easily broaden the class with more functions as you need.
