Scraping Yahoo Finance Balance Sheet with Python

My question is a follow-up question to one asked here.
The function:
periodic_figure_values()
seems to work fine except when the name of the line item being searched for appears more than once. The specific case I am referring to is trying to get data for "Long Term Debt". The function in the link above returns the following error:
Traceback (most recent call last):
File "test.py", line 31, in <module>
LongTermDebt=(periodic_figure_values(soup, "Long Term Debt"))
File "test.py", line 21, in periodic_figure_values
value = int(str_value)
ValueError: invalid literal for int() with base 10: 'Short/Current Long Term Debt'
because it seems to get tripped up on "Short/Current Long Term Debt". You see, the page has both "Short/Current Long Term Debt" and "Long Term Debt". You can see an example of the source page using Apple's balance sheet here.
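A quick check suggests why the lookup lands on the wrong row first: the function compiles the figure name into a regex, and (as far as I can tell) BeautifulSoup applies the compiled pattern with search(), which matches substrings, so "Long Term Debt" also matches inside "Short/Current Long Term Debt":
import re

pattern = re.compile("Long Term Debt")
# search() finds the figure name as a substring of the other label,
# so the "Short/Current Long Term Debt" row can be matched first
print(bool(pattern.search("Short/Current Long Term Debt")))    # True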
I'm trying to find a way for the function to return data for "Long Term Debt" without getting tripped up on "Short/Current Long Term Debt".
Here is the function and an example that fetches "Cash and Cash Equivalents", which works fine, and "Long Term Debt", which does not work:
import requests, bs4, re, sys

def periodic_figure_values(soup, yahoo_figure):
    values = []
    pattern = re.compile(yahoo_figure)
    title = soup.find("strong", text=pattern)    # works for the figures printed in bold
    if title:
        row = title.parent.parent
    else:
        title = soup.find("td", text=pattern)    # works for any other available figure
        if title:
            row = title.parent
        else:
            sys.exit("Invalid figure '" + yahoo_figure + "' passed.")
    cells = row.find_all("td")[1:]    # exclude the <td> with figure name
    for cell in cells:
        if cell.text.strip() != yahoo_figure:    # needed because some figures are indented
            str_value = cell.text.strip().replace(",", "").replace("(", "-").replace(")", "")
            if str_value == "-":
                str_value = 0
            value = int(str_value)
            values.append(value)
    return values

res = requests.get('https://ca.finance.yahoo.com/q/bs?s=AAPL')
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')
Cash = periodic_figure_values(soup, "Cash And Cash Equivalents")
print(Cash)
LongTermDebt = periodic_figure_values(soup, "Long Term Debt")
print(LongTermDebt)

The easiest fix would be to wrap the conversion in a try/except, catching the raised ValueError:
import requests, bs4, re, sys

def periodic_figure_values(soup, yahoo_figure):
    values = []
    pattern = re.compile(yahoo_figure)
    title = soup.find("strong", text=pattern)    # works for the figures printed in bold
    if title:
        row = title.parent.parent
    else:
        title = soup.find("td", text=pattern)    # works for any other available figure
        if title:
            row = title.parent
        else:
            sys.exit("Invalid figure '" + yahoo_figure + "' passed.")
    cells = row.find_all("td")[1:]    # exclude the <td> with figure name
    for cell in cells:
        if cell.text.strip() != yahoo_figure:    # needed because some figures are indented
            str_value = cell.text.strip().replace(",", "").replace("(", "-").replace(")", "")
            if str_value == "-":
                str_value = 0
            ### from here
            try:
                value = int(str_value)
                values.append(value)
            except ValueError:
                continue
            ### to here
    return values

res = requests.get('https://ca.finance.yahoo.com/q/bs?s=AAPL')
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, 'html.parser')
Cash = periodic_figure_values(soup, "Cash And Cash Equivalents")
print(Cash)
LongTermDebt = periodic_figure_values(soup, "Long Term Debt")
print(LongTermDebt)
This version prints out your numbers just fine.
Note that you do not really need the re module here, since you are only matching literal strings (no wildcards, no boundaries, etc.).
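For instance, passing the literal figure name straight to find() does an exact text match (a minimal sketch, assuming the cell text is exactly the figure name with no surrounding whitespace):
# Sketch: exact-text lookup instead of a regex; assumes the <td> text equals the figure name exactly
title = soup.find("td", text="Long Term Debt")
if title:
    row = title.parent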

You could change the function so that it accepts a regular expression instead of a plain string. Then you can search for ^Long Term Debt to make sure there's no text before that. All you need to do is to change
if cell.text.strip() != yahoo_figure:
to
if not re.match(yahoo_figure, cell.text.strip()):
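Then the call for the problematic figure could look something like this (a sketch, anchoring the pattern so only labels starting with "Long Term Debt" match):
# Sketch: an anchored pattern, so "Short/Current Long Term Debt" no longer matches
LongTermDebt = periodic_figure_values(soup, r"^Long Term Debt")
print(LongTermDebt)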

Related

Python Streamlit, and yfinance issues

I'll list the two bugs I know of as of now; if you have any recommendations for refactoring my code, let me know as well.
yfinance is not appending the dividendYield to my dict. I did make sure that there is an actual dividend yield for those symbols.
TypeError: can only concatenate str (not "Tag") to str, which I assume has something to do with how it parses through the XML: it ran into a Tag, so I am not able to create the expander. I thought I could solve it with this if statement, but instead I just don't get any expander at all.
with st.expander("Expand for stocks news"):
    for heading in fin_headings:
        if heading == str:
            st.markdown("* " + heading)
        else:
            pass
Full code for main.py:
import requests
import spacy
import pandas as pd
import yfinance as yf
import streamlit as st
from bs4 import BeautifulSoup

st.title("Fire stocks :fire:")
nlp = spacy.load("en_core_web_sm")

def extract_rss(rss_link):
    # Parses xml, and extracts the headings.
    headings = []
    response1 = requests.get(
        "http://feeds.marketwatch.com/marketwatch/marketpulse/")
    response2 = requests.get(rss_link)
    parse1 = BeautifulSoup(response1.content, features="xml")
    parse2 = BeautifulSoup(response2.content, features="xml")
    headings1 = parse1.findAll('title')
    headings2 = parse2.findAll('title')
    headings = headings1 + headings2
    return headings

def stock_info(headings):
    # Get the entities from each heading, link them with nasdaq data if possible, and extract market data with yfinance.
    stock_dict = {
        'Org': [],
        'Symbol': [],
        'currentPrice': [],
        'dayHigh': [],
        'dayLow': [],
        'forwardPE': [],
        'dividendYield': []
    }
    stocks_df = pd.read_csv("./data/nasdaq_screener_1658383327100.csv")
    for title in headings:
        doc = nlp(title.text)
        for ent in doc.ents:
            try:
                if stocks_df['Name'].str.contains(ent.text).sum():
                    symbol = stocks_df[stocks_df['Name'].str.contains(
                        ent.text)]['Symbol'].values[0]
                    org_name = stocks_df[stocks_df['Name'].str.contains(
                        ent.text)]['Name'].values[0]
                    # Receive info from yfinance
                    stock_info = yf.Ticker(symbol).info
                    print(symbol)
                    stock_dict['Org'].append(org_name)
                    stock_dict['Symbol'].append(symbol)
                    stock_dict['currentPrice'].append(
                        stock_info['currentPrice'])
                    stock_dict['dayHigh'].append(stock_info['dayHigh'])
                    stock_dict['dayLow'].append(stock_info['dayLow'])
                    stock_dict['forwardPE'].append(stock_info['forwardPE'])
                    stock_dict['dividendYield'].append(
                        stock_info['dividendYield'])
                else:
                    # If name can't be found pass.
                    pass
            except:
                # Don't raise an error.
                pass
    output_df = pd.DataFrame.from_dict(stock_dict, orient='index')
    output_df = output_df.transpose()
    return output_df

# Add input field
user_input = st.text_input(
    "Add rss link here", "https://www.investing.com/rss/news.rss")

# Get financial headlines
fin_headings = extract_rss(user_input)
print(fin_headings)

# Output financial info
output_df = stock_info(fin_headings)
output_df.drop_duplicates(inplace=True, subset='Symbol')
st.dataframe(output_df)

with st.expander("Expand for stocks news"):
    for heading in fin_headings:
        if heading == str:
            st.markdown("* " + heading)
        else:
            pass
There is an issue in the logic of your stock_info function: the same symbol can end up with values from different rows, and when you drop the duplicates, only the row for the first occurrence of each symbol is retained.
The code below will solve both of your issues.
import requests
import spacy
import pandas as pd
import yfinance as yf
import streamlit as st
from bs4 import BeautifulSoup

st.title("Fire stocks :fire:")
nlp = spacy.load("en_core_web_sm")

def extract_rss(rss_link):
    # Parses xml, and extracts the headings.
    headings = []
    response1 = requests.get(
        "http://feeds.marketwatch.com/marketwatch/marketpulse/")
    response2 = requests.get(rss_link)
    parse1 = BeautifulSoup(response1.content, features="xml")
    parse2 = BeautifulSoup(response2.content, features="xml")
    headings1 = parse1.findAll('title')
    headings2 = parse2.findAll('title')
    headings = headings1 + headings2
    return headings

def stock_info(headings):
    stock_info_list = []
    stocks_df = pd.read_csv("./data/nasdaq_screener_1658383327100.csv")
    for title in headings:
        doc = nlp(title.text)
        for ent in doc.ents:
            try:
                if stocks_df['Name'].str.contains(ent.text).sum():
                    symbol = stocks_df[stocks_df['Name'].str.contains(
                        ent.text)]['Symbol'].values[0]
                    org_name = stocks_df[stocks_df['Name'].str.contains(
                        ent.text)]['Name'].values[0]
                    # Receive info from yfinance
                    print(symbol)
                    stock_info = yf.Ticker(symbol).info
                    stock_info['Org'] = org_name
                    stock_info['Symbol'] = symbol
                    stock_info_list.append(stock_info)
                else:
                    # If name can't be found pass.
                    pass
            except:
                # Don't raise an error.
                pass
    output_df = pd.DataFrame(stock_info_list)
    return output_df

# Add input field
user_input = st.text_input(
    "Add rss link here", "https://www.investing.com/rss/news.rss")

# Get financial headlines
fin_headings = extract_rss(user_input)

output_df = stock_info(fin_headings)
output_df = output_df[['Org', 'Symbol', 'currentPrice', 'dayHigh', 'dayLow', 'forwardPE', 'dividendYield']]
output_df.drop_duplicates(inplace=True, subset='Symbol')
st.dataframe(output_df)

with st.expander("Expand for stocks news"):
    for heading in fin_headings:
        heading = heading.text
        if type(heading) == str:
            st.markdown("* " + heading)
        else:
            pass
For issue #2, the patch code that you posted has a small mistake. Rather than checking heading == str, which does something completely different from what you intended and will always be False, you would want to check isinstance(heading, str). That way you get True if heading is a string and False if not. However, even that would not be a fix here, because heading is not a string. Instead you want to call get_text on heading to get the actual text part of the parsed object.
heading.get_text()
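Putting that together, the expander loop could look something like this (a minimal sketch):
with st.expander("Expand for stocks news"):
    for heading in fin_headings:
        # heading is a bs4 Tag; get_text() returns its text content as a plain string
        st.markdown("* " + heading.get_text())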
More information would be needed to solve issue #1. What does stock_dict look like before you create the Dataframe out of it? Specifically, what values are in stock_dict['dividendYield']? Can you print it and add it to your question?
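For example, a quick debug print just before the DataFrame is built in stock_info() would show whether that list is empty or shorter than the others (a hypothetical inspection snippet, not part of the fix):
# Hypothetical debug output, placed at the end of stock_info() before building the DataFrame
print(stock_dict['dividendYield'])
print({key: len(values) for key, values in stock_dict.items()})    # compare column lengths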
Also, about the refactoring part: an
else:
    pass
block does nothing at all and should be deleted. (When the if condition is false, nothing happens anyway.)

Trying to crawl all news links on a site (the parsed link only shows 10 results per page, I need to find ALL links)

I am trying to crawl all news links that contain a certain keyword I am looking for.
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import re

key_word = urllib.parse.quote("금리")
url = "https://search.naver.com/search.naver?where=news&query=" + key_word + "%EA%B8%88%EB%A6%AC&sm=tab_opt&sort=0&photo=0&field=0&reporter_article=&pd=3&ds=2020.04.13&de=2020.04.14&docid=&nso=so%3Ar%2Cp%3Afrom20200413to20200414%2Ca%3Aall&mynews=0&refresh_start=0&related=0"

html = urllib.request.urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')

anchor_set = soup.findAll('a')
news_link = []
for a in anchor_set:
    if str(a).find('https://news.naver.com/main/read.nhn?') != -1:
        a = a.get('href')
        news_link.append(a)
Up until this section (the code above), I open the URL and retrieve all links that contain read.nhn (the Naver news platform) and append them to news_link.
This is working fine, but the problem is that the URL used above only shows 10 articles per page.
count_tag = soup.find("div", {"class", "title_desc all_my"})
count_text = count_tag.find("span").get_text().split()
total_num = count_text[-1][0:-1].replace(",", "")
print(total_num)
Using the code above, I've found out there are a total of 1297 articles that I need to collect, but the original link above only shows 10 articles per page.
for val in range(int(total_num)//10+1):
    start_val = str(val*10+1)
I was told I needed to insert this value into the URL to retrieve ALL the news links.
Thus, I've used a while loop:
while start_val <= total_num:
    url = "https://search.naver.com/search.naver?where=news&query=" + key_word + "%EA%B8%88%EB%A6%AC&sm=tab_opt&sort=0&photo=0&field=0&reporter_article=&pd=3&ds=2020.04.13&de=2020.04.14&docid=&nso=so%3Ar%2Cp%3Afrom20200413to20200414%2Ca%3Aall&mynews=0&refresh_start=" + start_val + "&related=0"
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    news_link = []
    anchor_set = soup.findAll('a')
    for a in anchor_set:
        if str(a).find('https://news.naver.com/main/read.nhn?') != -1:
            a = a.get('href')
            news_link.append(a)
However, when I run the program, the loop does not seem to stop. Obviously there is no else or break. How can I break out of this loop and successfully collect all the links?
Your current while loop doesn't stop because you haven't incremented the value of start_val. Also, total_num is a string (you even convert it with int(total_num) later), so while start_val <= total_num is a string comparison and gives the wrong result: for strings, "21" > "1297", because "2" > "1". Compare them as ints instead.
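For instance:
print("21" <= "1297")    # False - strings compare character by character, and "2" > "1"
print(21 <= 1297)        # True once both sides are ints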
And since you're creating the sequence of vals to use, you don't need a separate upper bound check.
So far, this would give you the correct finite loop:
for val in range(int(total_num)//10+1):    # no upper bound check needed
    start_val = str(val*10+1)
    url = "https://search.naver.com/search.naver?where=news&query=" ...
    html = urllib.request.urlopen(url).read()
    ...
For the values needed for the pages/next starting item, instead of doing:
for val in range(int(total_num)//10+1):
    start_val = str(val*10+1)
You can get the actual vals from range() directly. To start at 1 and go in steps of 10, producing 1, 11, 21, ..., up to and including the total:
for val in range(1, int(total_num) + 1, 10):
    start_val = str(val)    # don't need this assignment actually
Next thing: the URL for page 2 onwards is wrong. Currently, your while loop will generate the following URL for page 2:
https://search.naver.com/search.naver?where=news&query=%EA%B8%88%EB%A6%AC%EA%B8%88%EB%A6%AC&sm=tab_opt&sort=0&photo=0&field=0&reporter_article=&pd=3&ds=2020.04.13&de=2020.04.14&docid=&nso=so%3Ar%2Cp%3Afrom20200413to20200414%2Ca%3Aall&mynews=0&refresh_start=11&related=0
But if you click on page "2" of the results, you get the URL:
https://search.naver.com/search.naver?&where=news&query=%EA%B8%88%EB%A6%AC%EA%B8%88%EB%A6%AC&sm=tab_pge&sort=0&photo=0&field=0&reporter_article=&pd=3&ds=2020.04.13&de=2020.04.14&docid=&nso=so:r,p:from20200413to20200414,a:all&mynews=0&cluster_rank=35&start=11&refresh_start=0
The main difference is at the end: &refresh_start=11 in yours vs &start=11&refresh_start=0 actual. Since that format also works for page 1 (just checked), use that instead.
You have some extra characters in the section after the keyword: ...&query=" + key_word +"%EA%B8%88%EB%A6%AC&sm=tab_opt. That %EA%B8%88%EB%A6%AC is from your previous search keyword.
You can also drop several unneeded URL parameters by testing which ones are actually required.
Putting all that together:
for val in range(1, int(total_num) + 1, 10):
    start_val = str(val)
    url = ("https://search.naver.com/search.naver?&where=news&query=" +
           key_word +
           "&sm=tab_pge&sort=0&photo=0&field=0&reporter_article=&pd=3&ds=2020.04.13&de=2020.04.14" +
           "&docid=&nso=so:r,p:from20200413to20200414,a:all&mynews=0&cluster_rank=51" +
           "&refresh_start=0&start=" +
           start_val)
    html = urllib.request.urlopen(url).read()
    ...    # etc.

Cannot download Wordnet Error

I am trying to compile this code:
from collections import OrderedDict
import pdb
pdb.set_trace()
def alphaDict(words):
alphas = OrderedDict()
words = sorted(words, key = str.lower)
words = filter(None, words);
for word in words:
if word[0].upper() not in alphas:
alphas[word[0].upper()] = []
alphas[word[0].upper()].append(word.lower())
return alphas
def listConvert(passage):
alphabets = " abcdefghijklmnopqrstuvwxyz"
for char in passage:
if char.lower() not in alphabets:
passage = passage.replace(char, "")
listConvert(passage)
passage = rDup(passage.split(" "))
return passage
def rDup(sequence):
unique = []
[unique.append(item) for item in sequence if item not in unique]
return unique
def otherPrint(word):
base = "http://dictionary.reference.com/browse/"
end = "?s=t"
from nltk.corpus import wordnet as wn
data = [s.definition() for s in wn.synsets(word)]
print("<li>")
print("<a href = '" +base+word+end+"' target = '_blank'><h2 class = 'dictlink'>" +(word.lower())+":</h2></a>")
if not data:
print("Sorry, we could not find this word in our data banks. Please click the word to check <a target = '_blank' class = 'dictlink' href = 'http://www.dictionary.com'>Dictionary.com</a>")
return
print("<ul>")
for key in data:
print("<li>"+key+"</li>")
print("</ul>")
print("</ol>")
print("</li>")
def formatPrint(word):
base = "http://dictionary.reference.com/browse/"
end = "?s=t"
from PyDictionary import PyDictionary
pd = PyDictionary()
data = pd.meaning(word)
print "<li>"
print "<a href = '" +base+word+end+"' target = '_blank'><h2 class = 'dictlink'>" +(word.lower())+":</h2></a>"
if not data:
print "Sorry, we could not find this word in our data banks. Please click the word to check <a target = '_blank' class = 'dictlink' href = 'http://www.dictionary.com'>Dictionary.com</a>"
return
print "<ol type = 'A'>"
for key in data:
print "<li><h3 style = 'color: red;' id = '" +word.lower()+ "'>"+key+"</h3><ul type = 'square'>"
for item in data[key]:
print "<li>" +item+"</li>"
print "</ul>"
print "</li>"
print "</ol>"
print "</li>"
def specPrint(words):
print "<ol>"
for word in words:
otherPrint(word)
print "</ol>"
print "<br/>"
print "<br/>"
print "<a href = '#list'> Click here</a> to go back to choose another letter<br/>"
print "<a href = '#sentence'>Click here</a> to view your sentence.<br/>"
print "<a href = '#header'>Click here</a> to see the owner's information.<br/>"
print "<a href = '../html/main.html'>Click here</a> to go back to the main page."
print "</div>"
for x in range(0, 10):
print "<br/>"
To all those who answered my previous question, thank you. It worked; I will be accepting an answer soon. However, I have another problem. When I try to import wordnet in a shell (both by running the script and from IDLE commands), it works fine. However, on XAMPP, I get this error:
Can someone please explain this as well? Thanks!
The body of your for loop is not indented inside the loop:
for key in data:
print("<li>"+key+"</li>")
print("</ul>")
print("</ol>")
print("</li>")
This is most probably the issue. Try indenting it:
for key in data:
    print("<li>"+key+"</li>")
    print("</ul>")
    print("</ol>")
    print("</li>")
Also, please understand that Python treats tabs and spaces differently, so if you indent one line using a tab and the next line using 4 spaces (manual spaces), it will cause an indentation error in Python. You have to use either all spaces or all tabs; you cannot use a mixture of both (even though they look the same).
A couple of things. First is the indent of line one. That may just be copying here.
Then every time you have a colon, you need to have the next line indented. So in the otherPrint function you have this:
for key in data:
print("<li>"+key+"</li>")
print("</ul>")
print("</ol>")
print("</li>")
At least the first line needs to be indented. If you intend all of the prints to be in the loop then you need to indent all of them.
You also have the same issue with your if statements in the formatPrint function. Try indenting them under the loops and conditionals and this should clear it up. If you are still finding a problem, then check to make sure you have the correct number of parentheses and brackets closing out statements. Leaving one off will cause the rest of the code to go wonky.
Also, you are using print statements instead of the print() function. The print statement no longer works in Python 3.x; you have to enclose all of that in parentheses.
def formatPrint(word):
    base = "http://dictionary.reference.com/browse/"
    end = "?s=t"
    from PyDictionary import PyDictionary
    pd = PyDictionary()
    data = pd.meaning(word)
    print("<li>")
    print(
        "<a href = '" + base + word + end + "' target = '_blank'>"
        "<h2 class = 'dictlink'>" + (word.lower()) + ":</h2></a>"
    )
    if not data:
        print(
            "Sorry, we could not find this word in our data banks. "
            "Please click the word to check <a target = '_blank' "
            "class = 'dictlink' href "
            "='http://www.dictionary.com'>Dictionary.com</a>"
        )
        return
    print("<ol type = 'A'>")
    for key in data:
        print(
            "<li><h3 style = 'color: red;' id = '" + word.lower() +
            "'>" + key + "</h3><ul type = 'square'>"
        )
        for item in data[key]:
            print("<li>" + item + "</li>")
        print("</ul>")
        print("</li>")
    print("</ol>")
    print("</li>")

Get text from webpage as iterable object in python 3.3

I'm trying to get the text from a webpage with Python 3.3 and then search through that text for certain strings. When I find a matching string, I need to save the text that follows it. For example, I take this page: http://gatherer.wizards.com/Pages/Card/Details.aspx?name=Dark%20Prophecy
and I need to save the text after each category (card text, rarity, etc.) in the card info.
Currently I'm using Beautiful Soup, but get_text causes a UnicodeEncodeError and doesn't return an iterable object. Here is the relevant code:
urlStr = urllib.request.urlopen(
    'http://gatherer.wizards.com/Pages/Card/Details.aspx?name=' + cardName
).read()
htmlRaw = BeautifulSoup(urlStr)
htmlText = htmlRaw.get_text

for line in htmlText:
    line = line.strip()
    if "Converted Mana Cost:" in line:
        cmc = line.next()
        message += "*Converted Mana Cost: " + cmc + "* \n\n"
    elif "Types:" in line:
        type = line.next()
        message += "*Type: " + type + "* \n\n"
    elif "Card Text:" in line:
        rulesText = line.next()
        message += "*Rules Text: " + rulesText + "* \n\n"
    elif "Flavor Text:" in line:
        flavor = line.next()
        message += "*Flavor Text: " + flavor + "* \n\n"
    elif "Rarity:" in line:
        rarity = line.next()
        message += "*Rarity: " + rarity + "* \n\n"
This is incorrect:
htmlText = htmlRaw.get_text
As get_text is a method of the BeautifulSoup class, you're assigning the method to htmlText and not its result. There is a property variant of it that will do what you want here:
htmlText = htmlRaw.text
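Equivalently, you could call the method instead of assigning it:
htmlText = htmlRaw.get_text()    # note the parentheses: call the method to get the string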
You're also using an HTML parser simply to strip tags, when you could use it to target the data you want:
# unique id for the html section containing the card info
card_id = 'ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_rightCol'

# grab the html section with the card info
card_data = htmlRaw.find(id=card_id)

# create a generator to iterate over the rows
card_rows = (row for row in card_data.find_all('div', 'row'))

# create a generator to produce functions for retrieving the values
card_rows_getters = (lambda x: row.find('div', x).text.strip() for row in card_rows)

# create a generator to get the values
card_values = ((get('label'), get('value')) for get in card_rows_getters)

# dump them into a dictionary
cards = dict(card_values)

print(cards)
{u'Artist:': u'Scott Chou',
u'Card Name:': u'Dark Prophecy',
u'Card Number:': u'93',
u'Card Text:': u'Whenever a creature you control dies, you draw a card and lose 1 life.',
u'Community Rating:': u'Community Rating: 3.617 / 5\xa0\xa0(64 votes)',
u'Converted Mana Cost:': u'3',
u'Expansion:': u'Magic 2014 Core Set',
u'Flavor Text:': u'When the bog ran short on small animals, Ekri turned to the surrounding farmlands.',
u'Mana Cost:': u'',
u'Rarity:': u'Rare',
u'Types:': u'Enchantment'}
Now you have a dictionary of the information you want (plus a few extra) which will be a lot easier to deal with.
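For example, individual fields can then be looked up by their label (note that the keys keep the trailing colon):
print(cards['Rarity:'])       # Rare
print(cards['Card Text:'])    # Whenever a creature you control dies, ...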

How to scrape more than first instance of triple-nested list of links in Python?

I am trying to determine the simplest way to record the contents of webpages linked from webpages linked from an original webpage. I would like my output to be a table with rows corresponding to the contents of the third layer deep of pages.
As you can see from the code, I am currently only able to get the first instance of a desired item on the third-level page. Also, while my current code will return one row corresponding to each h2 item on the base URL, I hope to have multiple rows per h2 item (as many as there are instances of "span.'case-doc-details' a" on the second layer).
Some additional info: at each linking stage, I do not know how many pages will be linked. I am using Python and ScraperWiki, and am new to both. I have attempted to research the question, but have hit a roadblock in my knowledge of what to ask. Thanks in advance for any help.
import scraperwiki
import urlparse
import lxml.html
import urllib

def scrape_table(root):
    rows = root.cssselect("h2")
    record = {}
    counter = 0
    for row in rows:
        table_cells = row.cssselect("h2 a")
        for cell in table_cells:
            record['Count'] = counter
            table_cellsurls = table_cells[0].cssselect("a")
            record['CaseURL'] = table_cellsurls[0].attrib.get('href')
            caselinkurl = urllib.urlopen('http://www.italaw.com/' + table_cellsurls[0].attrib.get('href')).read()
            #print caselinkurl
            caseroots = lxml.html.fromstring(caselinkurl)
            title = caseroots.cssselect("title")
            record['Title'] = title[0].text_content()
            ids = caseroots.cssselect("div div div div a")
            for i in ids:
                if len(ids) <= 2:
                    record['Rules'] = "None"
                    record['Treaty'] = "None"
                else:
                    record['Rules'] = ids[2].text_content()
                    record['Treaty'] = ids[3].text_content()
            pars = caseroots.cssselect("span.'case-doc-details' a")
            #print "pars length is", len(pars)
            caselinkurl2 = urllib.urlopen('http://www.italaw.com/' + pars[0].attrib.get('href')).read()
            caseroots2 = lxml.html.fromstring(caselinkurl2)
            #create another table element with rows, marked off with the case that they came from, create all the rows.
            for i in pars:
                if len(pars) == 0:
                    record['DetailsURL'] = "None"
                else:
                    record['DetailsURL'] = pars[0].attrib.get('href')
                pars2 = caseroots2.cssselect("div.'field-item even' span.'date-display-single'")
                if len(pars2) == 0:
                    record['Doc Date'] = "None"
                else:
                    record['Doc Date'] = pars2[0].text_content()
                pars3 = caseroots2.cssselect("div.'field-name-field-case-doc-file' span.'file' a")
                if len(pars3) == 0:
                    record['Doc Type Link'] = "None"
                    record['Doc Type'] = "None"
                else:
                    record['Doc Type Link'] = pars3[0].attrib.get('href')
                    record['Doc Type'] = pars3[0].text_content()
                pars4 = caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
                if len(pars4) == 0:
                    record['Claimant Nominee'] = "None"
                else:
                    record['Claimant Nominee'] = pars4[0].text_content()
                pars5 = caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
                if len(pars5) == 0:
                    record['Respondent Nominee'] = "None"
                else:
                    record['Respondent Nominee'] = pars5[0].text_content()
                pars6 = caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
                if len(pars6) == 0:
                    record['President'] = "None"
                else:
                    record['President'] = pars6[0].text_content()
            print record, '------------'
            scraperwiki.sqlite.save(['Count'], record)
            counter += 1

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)

#START HERE:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)
Here's the code I've got so far - this doesn't yet grab the documents link data (or save anything), but that should be a case of extending the principles here into another function:
import scraperwiki
import urlparse
import lxml.html
import urllib

def scrape_page(linkurl):
    html = scraperwiki.scrape(linkurl)
    root = lxml.html.fromstring(html)
    title = root.cssselect("h1")
    print "the title:", title[0].text
    record = {}
    record['title'] = title[0].text
    record['url'] = linkurl
    #<div class="field-items"><div class="field-item even"><a
    arbrules = root.cssselect("div.field-items a")
    if arbrules:
        record['arbruleurl'] = arbrules[0].attrib.get("href")
        record['arbrule'] = arbrules[0].text_content()
    else:
        record['arbruleurl'] = "NO URL"
        record['arbrule'] = "NO ARBRULE"
    legalbasis = root.cssselect("div.field-label")
    if legalbasis:
        record['legalbasis'] = legalbasis[0].text_content()
    else:
        record['legalbasis'] = "NO LEGAL BASIS GIVEN"
    extralinks = []
    contents = root.cssselect("div.view-content a")
    if contents:
        for content in contents:
            extralinks.append(content.text_content())
            extralinks.append(content.attrib.get("href"))
        record['extralinks'] = extralinks
    else:
        record['extralinks'] = "NO EXTRA LINKS"
    #record['firstparty'] = title[0].text.split(" v. ")[0]
    #record['secondparty'] = title[0].text.split(" v. ")[1]
    #record['casenumber'] = title[0].text.split(" Case No.")[1]
    print record

def scrape_table(root):
    links = root.cssselect("div.link-wrapper a")
    for link in links:
        print link.text_content()
        linkurl = link.attrib.get("href")
        print linkurl
        scrape_page('http://www.italaw.com' + linkurl)

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)

#START HERE:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)
Here is what I got to work for this problem.
A few instructive general points:
Use an if/else block to distinguish a zero length from a non-zero length of your key attribute.
Just before this, create your dictionary.
In both the if and else branches, give the printing, storing and index-incrementing instructions. Set your index to zero just before going into the loop.
In the else branch, create a for loop that iterates over each instance i, with the key attribute you want to iterate over set to record the ith instance. Set all other attributes to the zeroth instance.
Finally, when dealing with an arbitrary number of triple-nested links, it will generally be best to scrape all data (if possible) from the lowest level you are scraping. In my case this worked because all of the attributes I wanted to record were repeated on that level. In other cases, I am not sure what the best way to proceed would be.
Thanks to Paul for nudging this forward.
import scraperwiki
import urlparse
import lxml.html
import urllib

def scrape_table(root):
    rows = root.cssselect("h2")
    counter = 0
    for row in rows:
        table_cells = row.cssselect("h2 a")
        for cell in table_cells:
            table_cellsurls = table_cells[0].cssselect("a")
            #record['CaseURL'] = table_cellsurls[0].attrib.get('href')
            caselinkurl = urllib.urlopen('http://www.italaw.com/' + table_cellsurls[0].attrib.get('href')).read()
            #print caselinkurl
            caseroots = lxml.html.fromstring(caselinkurl)
            pars = caseroots.cssselect("span.'case-doc-details' a")
            #print "pars length is", len(pars)
            record = {}
            #create another table element with rows, marked off with the case that they came from, create all the rows.
            if len(pars) == 0:
                record['DetailsURL'] = "None"
                record['Count'] = counter
                print record, '------------'
                scraperwiki.sqlite.save(['Count'], record)
                counter += 1
            else:
                for i in range(0, len(pars)):
                    record['Count'] = counter
                    caselinkurl2 = urllib.urlopen('http://www.italaw.com/' + pars[i].attrib.get('href')).read()
                    caseroots2 = lxml.html.fromstring(caselinkurl2)
                    record['DetailsURL'] = pars[i].attrib.get('href')
                    title = caseroots2.cssselect("h2")
                    record['Title'] = title[1].text_content()
                    rules = caseroots2.cssselect("div.'field-name-field-arbitration-rules'")
                    if len(rules) == 0:
                        record['Rules'] = "None"
                    else:
                        record['Rules'] = rules[0].text_content()
                    treaty = caseroots2.cssselect("div.'field-name-field-case-treaties'")
                    if len(treaty) == 0:
                        record['Treaty'] = "None"
                    else:
                        record['Treaty'] = treaty[0].text_content()
                    pars2 = caseroots2.cssselect("div.'field-name-field-case-document-date'")
                    if len(pars2) == 0:
                        record['Doc Date'] = "None"
                    else:
                        record['Doc Date'] = pars2[0].text_content()
                    pars3 = caseroots2.cssselect("div.'field-name-field-case-doc-file' span.'file' a")
                    if len(pars3) == 0:
                        record['Doc Type Link'] = "None"
                        record['Doc Type'] = "None"
                    else:
                        record['Doc Type Link'] = pars3[0].attrib.get('href')
                        record['Doc Type'] = pars3[0].text_content()
                    pars4 = caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
                    if len(pars4) == 0:
                        record['Claimant Nominee'] = "None"
                    else:
                        record['Claimant Nominee'] = pars4[0].text_content()
                    pars5 = caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
                    if len(pars5) == 0:
                        record['Respondent Nominee'] = "None"
                    else:
                        record['Respondent Nominee'] = pars5[0].text_content()
                    pars6 = caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
                    if len(pars6) == 0:
                        record['President'] = "None"
                    else:
                        record['President'] = pars6[0].text_content()
                    print record, '------------'
                    scraperwiki.sqlite.save(['Count'], record)
                    counter += 1

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)

#START HERE:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)
