Cannot download Wordnet Error - python

I am trying to compile this code:
from collections import OrderedDict
import pdb
pdb.set_trace()
def alphaDict(words):
alphas = OrderedDict()
words = sorted(words, key = str.lower)
words = filter(None, words);
for word in words:
if word[0].upper() not in alphas:
alphas[word[0].upper()] = []
alphas[word[0].upper()].append(word.lower())
return alphas
def listConvert(passage):
alphabets = " abcdefghijklmnopqrstuvwxyz"
for char in passage:
if char.lower() not in alphabets:
passage = passage.replace(char, "")
listConvert(passage)
passage = rDup(passage.split(" "))
return passage
def rDup(sequence):
unique = []
[unique.append(item) for item in sequence if item not in unique]
return unique
def otherPrint(word):
base = "http://dictionary.reference.com/browse/"
end = "?s=t"
from nltk.corpus import wordnet as wn
data = [s.definition() for s in wn.synsets(word)]
print("<li>")
print("<a href = '" +base+word+end+"' target = '_blank'><h2 class = 'dictlink'>" +(word.lower())+":</h2></a>")
if not data:
print("Sorry, we could not find this word in our data banks. Please click the word to check <a target = '_blank' class = 'dictlink' href = 'http://www.dictionary.com'>Dictionary.com</a>")
return
print("<ul>")
for key in data:
print("<li>"+key+"</li>")
print("</ul>")
print("</ol>")
print("</li>")
def formatPrint(word):
base = "http://dictionary.reference.com/browse/"
end = "?s=t"
from PyDictionary import PyDictionary
pd = PyDictionary()
data = pd.meaning(word)
print "<li>"
print "<a href = '" +base+word+end+"' target = '_blank'><h2 class = 'dictlink'>" +(word.lower())+":</h2></a>"
if not data:
print "Sorry, we could not find this word in our data banks. Please click the word to check <a target = '_blank' class = 'dictlink' href = 'http://www.dictionary.com'>Dictionary.com</a>"
return
print "<ol type = 'A'>"
for key in data:
print "<li><h3 style = 'color: red;' id = '" +word.lower()+ "'>"+key+"</h3><ul type = 'square'>"
for item in data[key]:
print "<li>" +item+"</li>"
print "</ul>"
print "</li>"
print "</ol>"
print "</li>"
def specPrint(words):
print "<ol>"
for word in words:
otherPrint(word)
print "</ol>"
print "<br/>"
print "<br/>"
print "<a href = '#list'> Click here</a> to go back to choose another letter<br/>"
print "<a href = '#sentence'>Click here</a> to view your sentence.<br/>"
print "<a href = '#header'>Click here</a> to see the owner's information.<br/>"
print "<a href = '../html/main.html'>Click here</a> to go back to the main page."
print "</div>"
for x in range(0, 10):
print "<br/>"
To all those who answered my previous question, thank you. It worked, and I will be accepting an answer soon. However, I have another problem. When I try to import wordnet in a shell (both by running the script and from IDLE), it works fine. However, on XAMPP, I get this error:
Can someone please explain this as well? Thanks!

The body of your for loop is not indented in the otherPrint function:
for key in data:
print("<li>"+key+"</li>")
print("</ul>")
print("</ol>")
print("</li>")
This is most probably the issue. Try indenting it-
for key in data:
    print("<li>"+key+"</li>")
print("</ul>")
print("</ol>")
print("</li>")
Also, please understand that Python treats tabs and spaces differently, so if you indent one line using a tab and the next line using four (manual) spaces, it will cause an indentation error. You have to use either all spaces or all tabs; you cannot mix the two (even though they look the same).
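For instance, here is a minimal sketch (the source string is just an illustration, not your code) of how Python 3 reports mixed indentation:
# Hypothetical snippet: the first indented line uses a tab, the second uses spaces.
source = "def f():\n\tx = 1\n        return x\n"
try:
    compile(source, "<demo>", "exec")
except TabError as err:
    print("TabError:", err)  # inconsistent use of tabs and spaces in indentation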

A couple of things. First is the indent of line one; that may just be from copying the code here.
Then every time you have a colon, you need to have the next line indented. So in the otherPrint function you have this:
for key in data:
print("<li>"+key+"</li>")
print("</ul>")
print("</ol>")
print("</li>")
At least the first line needs to be indented. If you intend all of the prints to be in the loop then you need to indent all of them.
You also have the same issue with your if statements in the formatPrint function. Try indenting them under the loops and conditionals and this should clear it up. If you are still finding a problem, check that you have the correct number of parentheses and brackets closing out statements. Leaving one off will cause the rest of the code to go wonky.
Also, you are using print statements instead of the print() function. The print statement no longer works in Python 3.x; you have to enclose everything you print in parentheses.
def formatPrint(word):
    base = "http://dictionary.reference.com/browse/"
    end = "?s=t"
    from PyDictionary import PyDictionary
    pd = PyDictionary()
    data = pd.meaning(word)
    print("<li>")
    print(
        "<a href = '" + base + word + end + "' target = '_blank'>"
        "<h2 class = 'dictlink'>" + (word.lower()) + ":</h2></a>"
    )
    if not data:
        print(
            "Sorry, we could not find this word in our data banks. "
            "Please click the word to check <a target = '_blank' "
            "class = 'dictlink' href = 'http://www.dictionary.com'>Dictionary.com</a>"
        )
        return
    print("<ol type = 'A'>")
    for key in data:
        print(
            "<li><h3 style = 'color: red;' id = '" + word.lower() +
            "'>" + key + "</h3><ul type = 'square'>"
        )
        for item in data[key]:
            print("<li>" + item + "</li>")
        print("</ul>")
        print("</li>")
    print("</ol>")
    print("</li>")

Related

Search multiple words (dependant) from pdf

I happily found Python code to search for multiple words in a PDF.
I want to find the pages where two words both appear. For instance, I want both 'Name' and 'Address' to exist on the same page, and to get the page number where this occurs. If only one of the words is present, the page number is not required.
Thank you.
Code that I found:
Search Multiple words from pdf
Referring to the page cited by the author and to what I found here, I would suggest something like:
import re

def string_found(word, string_page):
    # Whole-word, case-insensitive match
    if re.search(r"\b" + re.escape(word) + r"\b", string_page, re.IGNORECASE):
        return True
    return False

word1 = "name"
word2 = "address"
for i in range(0, num_pages):
    page = object.getPage(i)
    text = page.extractText()  # get text of current page
    bool1 = string_found(word1, text)
    bool2 = string_found(word2, text)
    if bool1 and bool2:
        print(i)  # print the number of each page where both words occur
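For completeness, object and num_pages above are assumed to come from the PDF-reading snippet linked in the question; a sketch of that setup with PyPDF2 (sample.pdf is a hypothetical file name) would be:
import PyPDF2

pdf_file = open('sample.pdf', 'rb')      # hypothetical input file
object = PyPDF2.PdfFileReader(pdf_file)  # same name as used in the loop above
num_pages = object.getNumPages()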

Scraping Yahoo Finance Balance Sheet with Python

My question is a follow-up question to one asked here.
The function:
periodic_figure_values()
seems to work fine except in the case where the name of a line item being searched appears twice. The specific case I am referring to is trying to get data for "Long Term Debt". The function in the link above will return the following error:
Traceback (most recent call last):
File "test.py", line 31, in <module>
LongTermDebt=(periodic_figure_values(soup, "Long Term Debt"))
File "test.py", line 21, in periodic_figure_values
value = int(str_value)
ValueError: invalid literal for int() with base 10: 'Short/Current Long Term Debt'
because it seems to get tripped up on "Short/Current Long Term Debt". You see, the page has both "Short/Current Long Term Debt" and "Long Term Debt". You can see an example of the source page using Apple's balance sheet here.
I'm trying to find a way for the function to return data for "Long Term Debt" without getting tripped up on "Short/Current Long Term Debt".
Here is the function and an example that fetches "Cash and Cash Equivalents", which works fine, and "Long Term Debt", which does not work:
import requests, bs4, re

def periodic_figure_values(soup, yahoo_figure):
    values = []
    pattern = re.compile(yahoo_figure)
    title = soup.find("strong", text=pattern) # works for the figures printed in bold
    if title:
        row = title.parent.parent
    else:
        title = soup.find("td", text=pattern) # works for any other available figure
        if title:
            row = title.parent
        else:
            sys.exit("Invalid figure '" + yahoo_figure + "' passed.")
    cells = row.find_all("td")[1:] # exclude the <td> with figure name
    for cell in cells:
        if cell.text.strip() != yahoo_figure: # needed because some figures are indented
            str_value = cell.text.strip().replace(",", "").replace("(", "-").replace(")", "")
            if str_value == "-":
                str_value = 0
            value = int(str_value)
            values.append(value)
    return values

res = requests.get('https://ca.finance.yahoo.com/q/bs?s=AAPL')
res.raise_for_status
soup = bs4.BeautifulSoup(res.text, 'html.parser')
Cash=(periodic_figure_values(soup, "Cash And Cash Equivalents"))
print(Cash)
LongTermDebt=(periodic_figure_values(soup, "Long Term Debt"))
print(LongTermDebt)
The easiest would be to use a try/except combination using the raised ValueError:
import requests, bs4, re

def periodic_figure_values(soup, yahoo_figure):
    values = []
    pattern = re.compile(yahoo_figure)
    title = soup.find("strong", text=pattern) # works for the figures printed in bold
    if title:
        row = title.parent.parent
    else:
        title = soup.find("td", text=pattern) # works for any other available figure
        if title:
            row = title.parent
        else:
            sys.exit("Invalid figure '" + yahoo_figure + "' passed.")
    cells = row.find_all("td")[1:] # exclude the <td> with figure name
    for cell in cells:
        if cell.text.strip() != yahoo_figure: # needed because some figures are indented
            str_value = cell.text.strip().replace(",", "").replace("(", "-").replace(")", "")
            if str_value == "-":
                str_value = 0
            ### from here
            try:
                value = int(str_value)
                values.append(value)
            except ValueError:
                continue
            ### to here
    return values

res = requests.get('https://ca.finance.yahoo.com/q/bs?s=AAPL')
res.raise_for_status
soup = bs4.BeautifulSoup(res.text, 'html.parser')
Cash=(periodic_figure_values(soup, "Cash And Cash Equivalents"))
print(Cash)
LongTermDebt=(periodic_figure_values(soup, "Long Term Debt"))
print(LongTermDebt)
This one prints out your numbers quite fine.
Note that you do not really need the re module in this situation, as you're checking for literals only (no wildcards, no boundaries).
You could instead change the function so that it accepts a regular expression rather than a plain string. Then you can search for ^Long Term Debt to make sure there's no text before it. All you need to do is change
if cell.text.strip() != yahoo_figure:
to
if not re.match(yahoo_figure, cell.text.strip()):
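For example, a quick check of how the anchored pattern behaves against the two labels from the question, with the call you would then make shown as a comment:
import re

pattern = r"^Long Term Debt"
print(bool(re.match(pattern, "Long Term Debt")))                # True
print(bool(re.match(pattern, "Short/Current Long Term Debt")))  # False
# LongTermDebt = periodic_figure_values(soup, r"^Long Term Debt")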

How to find a value for a specific key in python

import requests
import json

# initial message
message = "if i can\'t let it go out of my mind"
# split into list
split_message = message.split()

def decrementList(words):
    for w in [words] + [words[:-x] for x in range(1,len(words))]:
        url = 'http://ws.spotify.com/search/1/track.json?q='
        request = requests.get(url + "%20".join(w))
        json_dict = json.loads(request.content)
        num_results = json_dict['info']['num_results']
        if num_results > 0:
            num_removed = len(words) - len(w)
            #track_title = ' '.join(words)
            track_title = "If I Can't Take It with Me"
            for value in json_dict.items():
                if value == track_title:
                    print "match found"
            return num_removed, json_dict

num_words_removed, json_dict = decrementList(split_message)
In the code above, I am trying to match the name of a song to my search query. In this particular query, the song will not match, but I have added a variable that will match the song for the returned query. The for loop at the end of the function is supposed to match the track title variable, but I can't figure out why it isn't working. Is there a simple way to find all values for a known key? In this case, the key is "name".
You have to search for the track title within the "tracks" list of the response. So, just change your code like this:
for value in json_dict["tracks"]:
    if value["name"] == track_title:
it would print
match found
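As for the more general question (finding all values for a known key anywhere in the response), a small helper along these lines walks any nested structure of dicts and lists; this is just a sketch, not part of the Spotify API:
def find_key_values(obj, key):
    # Recursively collect every value stored under the given key in nested dicts/lists.
    found = []
    if isinstance(obj, dict):
        for k, v in obj.items():
            if k == key:
                found.append(v)
            found.extend(find_key_values(v, key))
    elif isinstance(obj, list):
        for item in obj:
            found.extend(find_key_values(item, key))
    return found

# e.g. every "name" in the response (tracks, artists, albums):
# names = find_key_values(json_dict, "name")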

Recursive function gives no output

I'm scraping all the URLs of my domain with a recursive function.
But it outputs nothing, without any error.
#usr/bin/python
from bs4 import BeautifulSoup
import requests
import tldextract

def scrape(url):
    for links in url:
        main_domain = tldextract.extract(links)
        r = requests.get(links)
        data = r.text
        soup = BeautifulSoup(data)
        for href in soup.find_all('a'):
            href = href.get('href')
            if not href:
                continue
            link_domain = tldextract.extract(href)
            if link_domain.domain == main_domain.domain :
                problem.append(href)
            elif not href == '#' and link_domain.tld == '':
                new = 'http://www.'+ main_domain.domain + '.' + main_domain.tld + '/' + href
                problem.append(new)
    return len(problem)
    return scrape(problem)

problem = ["http://xyzdomain.com"]
print(scrape(problem))
When I create a new list, it works, but I don't want to make a list every time for every loop.
You need to structure your code so that it meets the pattern for recursion, which your current code doesn't. You also should not give a variable the same name as the object it comes from, e.g. href = href.get('href'), because the original object is no longer accessible once the name is rebound. As it stands, your code will only ever return the len(), since that return is unconditionally reached before return scrape(problem):
def Recursive(Factorable_problem):
    if Factorable_problem is Simplest_Case:
        return AnswerToSimplestCase
    else:
        return Rule_For_Generating_From_Simpler_Case(Recursive(Simpler_Case))
for example:
def Factorial(n):
    """ Recursively Generate Factorials """
    if n < 2:
        return 1
    else:
        return n * Factorial(n-1)
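Applied to your crawler, a sketch of that pattern (reusing the question's imports and the same tldextract attributes; the simplest case is an empty list of links still to visit) could look like the following. Note that on a large site this still runs into Python's recursion limit, which is why the iterative version below is the safer route:
from bs4 import BeautifulSoup
import requests
import tldextract

def scrape(to_visit, seen=None):
    # Recursively crawl same-domain links, returning every URL visited.
    if seen is None:
        seen = []
    if not to_visit:              # simplest case: nothing left to crawl
        return seen
    link = to_visit[0]
    rest = to_visit[1:]
    if link in seen:              # already visited, move on to the rest
        return scrape(rest, seen)
    seen.append(link)
    main_domain = tldextract.extract(link)
    soup = BeautifulSoup(requests.get(link).text)
    for a in soup.find_all('a'):
        href = a.get('href')
        if not href:
            continue
        link_domain = tldextract.extract(href)
        if link_domain.domain == main_domain.domain:
            rest.append(href)
        elif href != '#' and link_domain.tld == '':
            rest.append('http://www.' + main_domain.domain + '.' + main_domain.tld + '/' + href)
    return scrape(rest, seen)     # recurse on the remaining links

print(scrape(["http://xyzdomain.com"]))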
Hello, I've made a non-recursive version of this that appears to get all the links on the same domain.
I tested the code below using the problem list included in the code. When I'd solved the problems with the recursive version, the next problem was hitting the recursion depth limit, so I rewrote it to run in an iterative fashion; the code and result are below:
from bs4 import BeautifulSoup
import requests
import tldextract

def print_domain_info(d):
    print "Main Domain:{0} \nSub Domain:{1} \nSuffix:{2}".format(d.domain,d.subdomain,d.suffix)

SEARCHED_URLS = []
problem = [ "http://Noelkd.neocities.org/", "http://youpi.neocities.org/"]

while problem:
    # Get a link from the stack of links
    link = problem.pop()
    # Check we haven't been to this address before
    if link in SEARCHED_URLS:
        continue
    # We don't want to come back here again after this point
    SEARCHED_URLS.append(link)
    # Try and get the website
    try:
        req = requests.get(link)
    except:
        # If it's not working I don't care for it
        print "borked website found: {0}".format(link)
        continue
    # Now we get to this point, worth printing something
    print "Trying to parse:{0}".format(link)
    print "Status Code:{0} Thats: {1}".format(req.status_code, "A-OK" if req.status_code == 200 else "SOMETHINGS UP" )
    # Get the domain info
    dInfo = tldextract.extract(link)
    print_domain_info(dInfo)
    # I like utf-8
    data = req.text.encode("utf-8")
    print "Length Of Data Retrieved:{0}".format(len(data)) # More info
    soup = BeautifulSoup(data) # This was here before so I left it.
    print "Found {0} link{1}".format(len(soup.find_all('a')),"s" if len(soup.find_all('a')) > 1 else "")
    FOUND_THIS_ITERATION = [] # Getting the same links over and over was boring
    found_links = [x for x in soup.find_all('a') if x.get('href') not in SEARCHED_URLS] # Find me all the links I don't got
    for href in found_links:
        href = href.get('href') # You wrote this, seems to work well
        if not href:
            continue
        link_domain = tldextract.extract(href)
        if link_domain.domain == dInfo.domain: # JUST FINDING STUFF ON SAME DOMAIN RIGHT?!
            if href not in FOUND_THIS_ITERATION: # I'ma check you out next time
                print "Check out this link: {0}".format(href)
                print_domain_info(link_domain)
                FOUND_THIS_ITERATION.append(href)
                problem.append(href)
            else: # I got you already
                print "DUPE LINK!"
        else:
            print "Not on same domain, moving on"
    # Count down
    print "We have {0} more sites to search".format(len(problem))
    if problem:
        continue
    else:
        print "Its been fun"
        print "Lets see the URLS we've visited:"
        for url in SEARCHED_URLS:
            print url
Which prints, after a lot of other logging, loads of neocities websites!
What's happening is that the script pops a value off the list of websites yet to visit, then gets all the links on that page which are on the same domain. If those links are to pages we haven't visited, we add them to the list of links to be visited. After that we pop the next page and do the same thing again until there are no pages left to visit.
I think this is what you're looking for; get back to us in the comments if this doesn't work in the way that you want, or if anyone can improve it, please leave a comment.

How to scrape more than first instance of triple-nested list of links in Python?

I am trying to determine the simplest way to record the contents of webpages linked from webpages linked from an original webpage. I would like my output to be a table with rows corresponding to the contents of the third layer deep of pages.
As you can see from the code, I am currently only able to get the first instance of a desired item on the third-level page. Also, while my current code will return one row corresponding to each h2 item on the base URL, I hope to have multiple rows per h2 item (as many as there are instances of "span.'case-doc-details' a" on the second layer).
Some additional info: at each linking stage, I do not know how many pages will be linked. I am using Python and ScraperWiki, and am new to both. I have attempted to research the question, but have hit a roadblock in my knowledge of what to ask. Thanks in advance for any help.
import scraperwiki
import urlparse
import lxml.html
import urllib

def scrape_table(root):
    rows = root.cssselect("h2")
    record = {}
    counter=0
    for row in rows:
        table_cells = row.cssselect("h2 a")
        for cell in table_cells:
            record['Count']=counter
            table_cellsurls = table_cells[0].cssselect("a")
            record['CaseURL'] = table_cellsurls[0].attrib.get('href')
            caselinkurl = urllib.urlopen('http://www.italaw.com/'+table_cellsurls[0].attrib.get('href')).read()
            #print caselinkurl
            caseroots = lxml.html.fromstring(caselinkurl)
            title=caseroots.cssselect("title")
            record['Title'] = title[0].text_content()
            ids=caseroots.cssselect("div div div div a")
            for i in ids:
                if len(ids)<=2:
                    record['Rules']="None"
                    record['Treaty']="None"
                else:
                    record['Rules']=ids[2].text_content()
                    record['Treaty']=ids[3].text_content()
            pars = caseroots.cssselect("span.'case-doc-details' a")
            #print "pars length is", len(pars)
            caselinkurl2=urllib.urlopen('http://www.italaw.com/'+pars[0].attrib.get('href')).read()
            caseroots2=lxml.html.fromstring(caselinkurl2)
            #create another table element with rows, marked off with the case that they came from, create all the rows.
            for i in pars:
                if len(pars)==0:
                    record['DetailsURL']="None"
                else:
                    record['DetailsURL']=pars[0].attrib.get('href')
                pars2=caseroots2.cssselect("div.'field-item even' span.'date-display-single'")
                if len(pars2)==0:
                    record['Doc Date']="None"
                else:
                    record['Doc Date']=pars2[0].text_content()
                pars3=caseroots2.cssselect("div.'field-name-field-case-doc-file' span.'file' a")
                if len(pars3) ==0:
                    record['Doc Type Link']="None"
                    record['Doc Type']="None"
                else:
                    record['Doc Type Link']=pars3[0].attrib.get('href')
                    record['Doc Type']=pars3[0].text_content()
                pars4=caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
                if len(pars4)==0:
                    record['Claimant Nominee']="None"
                else:
                    record['Claimant Nominee']=pars4[0].text_content()
                pars5=caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
                if len(pars5)==0:
                    record['Respondent Nominee']="None"
                else:
                    record['Respondent Nominee']=pars5[0].text_content()
                pars6=caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
                if len(pars6)==0:
                    record['President']="None"
                else:
                    record['President']=pars6[0].text_content()
            print record, '------------'
            scraperwiki.sqlite.save(['Count'],record)
            counter+=1

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)

#START HERE:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)
Here's the code I've got so far - this doesn't yet grab the documents link data (or save anything), but that should be a case of extending the principles here into another function:
import scraperwiki
import urlparse
import lxml.html
import urllib

def scrape_page(linkurl):
    html = scraperwiki.scrape(linkurl)
    root = lxml.html.fromstring(html)
    title = root.cssselect("h1")
    print "the title:", title[0].text
    record = {}
    record['title'] = title[0].text
    record['url'] = linkurl
    #<div class="field-items"><div class="field-item even"><a
    arbrules = root.cssselect("div.field-items a")
    if arbrules:
        record['arbruleurl'] = arbrules[0].attrib.get("href")
        record['arbrule'] = arbrules[0].text_content()
    else:
        record['arbruleurl'] = "NO URL"
        record['arbrule'] = "NO ARBRULE"
    legalbasis = root.cssselect("div.field-label")
    if legalbasis:
        record['legalbasis'] = legalbasis[0].text_content()
    else:
        record['legalbasis'] = "NO LEGAL BASIS GIVEN"
    extralinks = []
    contents = root.cssselect("div.view-content a")
    if contents:
        for content in contents:
            extralinks.append(content.text_content())
            extralinks.append(content.attrib.get("href"))
        record['extralinks'] = extralinks
    else:
        record['extralinks'] = "NO EXTRA LINKS"
    #record['firstparty'] = title[0].text.split(" v. ")[0]
    #record['secondparty'] = title[0].text.split(" v. ")[1]
    #record['casenumber'] = title[0].text.split(" Case No.")[1]
    print record

def scrape_table(root):
    links = root.cssselect("div.link-wrapper a")
    for link in links:
        print link.text_content()
        linkurl = link.attrib.get("href")
        print linkurl
        scrape_page('http://www.italaw.com'+linkurl)

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)

#START HERE:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)
Here is what I got to work for this problem.
A few instructive general points:
Use an if/else block to distinguish the case where your key attribute has zero length from the case where it doesn't.
Just before this, create your dictionary.
In both the if and else branches, do the printing, storing and counter incrementing. You'll set your counter to zero just before going into the loop.
In the else branch, create a for loop that iterates over each instance i, with the key attribute you want to iterate over set to record the ith instance. Set all other attributes to the zeroth instance.
Finally, when dealing with an arbitrary number of triple-nested links, it will generally be best to scrape all data (if possible) from the lowest level you are scraping. In my case, this worked, because all of the attributes I wanted to record were repeated on this level. In other cases, I am not sure what the best way to proceed would be.
Thanks to Paul for nudging this forward.
import scraperwiki
import urlparse
import lxml.html
import urllib

def scrape_table(root):
    rows = root.cssselect("h2")
    counter=0
    for row in rows:
        table_cells = row.cssselect("h2 a")
        for cell in table_cells:
            table_cellsurls = table_cells[0].cssselect("a")
            #record['CaseURL'] = table_cellsurls[0].attrib.get('href')
            caselinkurl = urllib.urlopen('http://www.italaw.com/'+table_cellsurls[0].attrib.get('href')).read()
            #print caselinkurl
            caseroots = lxml.html.fromstring(caselinkurl)
            pars = caseroots.cssselect("span.'case-doc-details' a")
            #print "pars length is", len(pars)
            record = {}
            #create another table element with rows, marked off with the case that they came from, create all the rows.
            if len(pars)==0:
                record['DetailsURL']="None"
                record['Count']=counter
                print record, '------------'
                scraperwiki.sqlite.save(['Count'],record)
                counter+=1
            else:
                for i in range(0,len(pars)):
                    record['Count']=counter
                    caselinkurl2=urllib.urlopen('http://www.italaw.com/'+pars[i].attrib.get('href')).read()
                    caseroots2=lxml.html.fromstring(caselinkurl2)
                    record['DetailsURL']=pars[i].attrib.get('href')
                    title=caseroots2.cssselect("h2")
                    record['Title'] = title[1].text_content()
                    rules=caseroots2.cssselect("div.'field-name-field-arbitration-rules'")
                    if len(rules)==0:
                        record['Rules']="None"
                    else:
                        record['Rules']=rules[0].text_content()
                    treaty=caseroots2.cssselect("div.'field-name-field-case-treaties'")
                    if len(treaty)==0:
                        record['Treaty']="None"
                    else:
                        record['Treaty']=treaty[0].text_content()
                    pars2=caseroots2.cssselect("div.'field-name-field-case-document-date'")
                    if len(pars2)==0:
                        record['Doc Date']="None"
                    else:
                        record['Doc Date']=pars2[0].text_content()
                    pars3=caseroots2.cssselect("div.'field-name-field-case-doc-file' span.'file' a")
                    if len(pars3) ==0:
                        record['Doc Type Link']="None"
                        record['Doc Type']="None"
                    else:
                        record['Doc Type Link']=pars3[0].attrib.get('href')
                        record['Doc Type']=pars3[0].text_content()
                    pars4=caseroots2.cssselect("div.'field-name-field-arbitrator-claimant'")
                    if len(pars4)==0:
                        record['Claimant Nominee']="None"
                    else:
                        record['Claimant Nominee']=pars4[0].text_content()
                    pars5=caseroots2.cssselect("div.'field-name-field-arbitrator-respondent'")
                    if len(pars5)==0:
                        record['Respondent Nominee']="None"
                    else:
                        record['Respondent Nominee']=pars5[0].text_content()
                    pars6=caseroots2.cssselect("div.'field-name-field-arbitrator-chair'")
                    if len(pars6)==0:
                        record['President']="None"
                    else:
                        record['President']=pars6[0].text_content()
                    print record, '------------'
                    scraperwiki.sqlite.save(['Count'],record)
                    counter+=1

def scrape_and_look_for_next_link(url):
    html = scraperwiki.scrape(url)
    print html
    root = lxml.html.fromstring(html)
    scrape_table(root)

#START HERE:
url = 'http://www.italaw.com/cases-by-respondent?field_case_respondent_tid=All'
scrape_and_look_for_next_link(url)
