Error in wikipedia subcategory crawling using python3

Error in wikipedia subcategory crawling using python3 - python

Hello Community Members,
I am getting the error NameError: name 'f' is not defined. The code is as follows. Please help. Any sort of help is appreciated. I have been strucked onto this since 3 days. The code is all about to extract all the subcategories name of wikipedia category in Python 3.
I have tried both the relative and absolute paths.
The code is as follows:
import httplib2
from bs4 import BeautifulSoup
import subprocess
import time, wget
import os, os.path
#declarations
catRoot = "http://en.wikipedia.org/wiki/Category:"
MAX_DEPTH = 100
done = []
ignore = []
path = 'trivial'
#Removes all newline characters and replaces with spaces
def removeNewLines(in_text):
return in_text.replace('\n', ' ')
# Downloads a link into the destination
def download(link, dest):
# print link
if not os.path.exists(dest) or os.path.getsize(dest) == 0:
subprocess.getoutput('wget "' + link + '" -O "' + dest+ '"')
print ("Downloading")
def ensureDir(f):
if not os.path.exists(f):
os.mkdir(f)
# Cleans a text by removing tags
def clean(in_text):
s_list = list(in_text)
i,j = 0,0
while i < len(s_list):
#iterate until a left-angle bracket is found
if s_list[i] == '<':
if s_list[i+1] == 'b' and s_list[i+2] == 'r' and s_list[i+3] == '>':
i=i+1
print ("hello")
continue
while s_list[i] != '>':
#pop everything from the the left-angle bracket until the right-angle bracket
s_list.pop(i)
#pops the right-angle bracket, too
s_list.pop(i)
elif s_list[i] == '\n':
s_list.pop(i)
else:
i=i+1
#convert the list back into text
join_char=''
return (join_char.join(s_list))#.replace("<br>","\n")
def getBullets(content):
mainSoup = BeautifulSoup(contents, "html.parser")
# Gets empty bullets
def getAllBullets(content):
mainSoup = BeautifulSoup(str(content), "html.parser")
subcategories = mainSoup.findAll('div',attrs={"class" : "CategoryTreeItem"})
empty = []
full = []
for x in subcategories:
subSoup = BeautifulSoup(str(x))
link = str(subSoup.findAll('a')[0])
if (str(x)).count("CategoryTreeEmptyBullet") > 0:
empty.append(clean(link).replace(" ","_"))
elif (str(x)).count("CategoryTreeBullet") > 0:
full.append(clean(link).replace(" ","_"))
return((empty,full))
def printTree(catName, count):
catName = catName.replace("\\'","'")
if count == MAX_DEPTH : return
download(catRoot+catName, path)
filepath = "categories/Category:"+catName+".html"
print(filepath)
content = open('filepath', 'w+')
content.readlines()
(emptyBullets,fullBullets) = getAllBullets(content)
f.close()
for x in emptyBullets:
for i in range(count):
print (" "),
download(catRoot+x, "categories/Category:"+x+".html")
print (x)
for x in fullBullets:
for i in range(count):
print (" "),
print (x)
if x in done:
print ("Done... "+x)
continue
done.append(x)
try: printTree(x, count + 1)
except:
print ("ERROR: " + x)
name = "Cricket"
printTree(name, 0)
The error encountered is as follows.

I think f.close() should be content.close().
It's common to use a context manager for such cases, though, like this:
with open(filepath, 'w+') as content:
(emptyBullets,fullBullets) = getAllBullets(content)
Then Python will close the file for you, even in case of an exception.
(I also changed 'filepath' to filepath, which I assume is the intent here.)

Related

Updates to text file are not being parsed using python

I'm parsing data from a text file ('placlog.txt') that is continuously being updated. As I run the code everything prints as expected, but if there are any updates to the placlog file while the code is running it is not printed.
The placlog file is being updated by a third-party program and I am using the above code to read the file and print any updates.
Once formatted, the text should be sent via a Telegram API. This part is also working initially.
import urllib.parse
import time
import requests
import os
def post_to_telegram(msg):
#print(msg)
base_url = 'https://api.telegram.org/bot&text="{}'.format(msg)
requests.get(base_url)
def check_url_inMsgList(stringToMatch, msgList):
for i in msgList:
if (stringToMatch in i):
return False
return True
try:
f = open("oldFile.txt", "r")
msgList = f.read().split("\n")
f.close()
except:
f = open("oldFile.txt", "w")
msgList = []
f.close()
selections = []
urr = ""
name = ""
pie = ""
ourLines = 2400
url_found = 0
name_found = 0
pie_found = 0
while (True):
file1 = open('placlog.txt', 'r')
Lines = file1.readlines()
file1.close()
while (True):
# print("-------------------------------")
if (ourLines == len(Lines)):
break
elif (ourLines > len(Lines)):
ourLines = 0
else:
txt = Lines[ourLines].strip()
tlist = txt.split("&")
ourLines = ourLines + 1
for subtxt in tlist:
if "eventurl=" in subtxt:
a = subtxt[9:len(subtxt) - 3]
url = "www.awebsite.com/%23" + a.replace("%23", "/")
#url = url.replace("%23", "#")
for i in range(10):
if "F" + str(i) + "/" in url:
url = url.split("F" + str(i) + "/")[0] + "F" + str(i) + "/"
urr = url
url_found = 1
elif "bit=" in subtxt:
name = urllib.parse.unquote(subtxt[4:len(subtxt)])
name_found = 1
elif "pie\":" in subtxt:
a = subtxt.split("price")[1]
pie = a.split("\"")[2]
pie = float(pie)
pie = round(pie, 1)
pie = str(pie)
pie_found = 1
selections.append(url + name + pie)
msg = (url + " " + name + " " + pie)
stringToFind = url + " " + name
if (check_url_inMsgList(stringToFind, msgList)):
post_to_telegram(msg)
msgList.append(msg)
print(msg)
f = open("oldFile.txt", "a+")
f.write(msg + "\n")
f.close()
time.sleep(0.5)
elif "minodds=" in subtxt:
a = subtxt.split("minodds=")[1]
pie = a.split("&")[0]
pie = float(pie)
rie = round(pie, 1)
pie = str(pie)
pie_found = 1
selections.append(url + name + pie)
msg = (url + " " + name + " " + pie)
stringToFind = url + " " + name
if (check_url_inMsgList(stringToFind, msgList)):
post_to_telegram(msg)
msgList.append(msg)
print(msg)
f = open("oldFile.txt", "a+")
f.write(msg + "\n")
f.close()
time.sleep(0.5)
time.sleep(1)

I would recommend using watchdog, and seeing if that helps your situation. It can monitor for file system changes, so you could define a function which is executed when the placlog.txt file is changed/updated.
A good guide can be found here: http://thepythoncorner.com/dev/how-to-create-a-watchdog-in-python-to-look-for-filesystem-changes/
From that guide, you can simply change the functions defined to suit your needs i.e.
def on_modified(event):
if event.src_path == "path/to/placlog.txt":
with open('placlog.txt', 'r') as placlog:
lines = file1.readlines()
Could you try this out and see if it helps? I still recommend the with statement for file i/o since you always want your file to close no matter what.
This link might also be useful since they are also monitoring a single .txt file: Python Watchdog - src_path inconsistent
watchdog documentation: https://pythonhosted.org/watchdog/
Note: Deleted the old answer since you clarified the question.

Python dictionary search not finding result given a string

I'm writing code for a project and it searches a text file for occurrences of a word on each line. When I use a example text file and search for a word it always prints out "No results for: " even if the word I searched for is in it. Did I setup the dictionary wrong or something?
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 14 11:31:17 2017
#author: Ben Roux
"""
import re
from collections import Counter
stringinput = raw_input("Please enter a filename to open: ")
dictionary = {}
def openFile(stringinput):
try:
filevariable = open(stringinput, 'r')
return filevariable
except IOError:
print("Cannot Find File!")
def readData(stringinput):
filevariable = open(stringinput, 'r')
rawline = filevariable.readline()
line = 1
while (rawline !=""):
pl1 = rawline.replace(",","")
pl2 = pl1.replace("'","")
pl3 = pl2.replace("!","")
pl4 = pl3.replace("-"," ")
pl5 = pl4.replace(".","")
pl6 = re.sub('(\\b[A-Za-z] \\b|\\b [A-Za-z]\\b)', '', pl5)
pl7 = pl6.lower()
checkdictionary = sorted(Counter(pl7.split()).items())
for i in range(len(checkdictionary)):
if checkdictionary[i] in dictionary:
firstvalue = dictionary.get(checkdictionary[i])
newvalue = str(firstvalue) + ", " + str(line)
d1 = {checkdictionary[i]: newvalue}
dictionary.update(d1)
else:
d2 = {checkdictionary[i]: line}
dictionary.update(d2)
rawline = filevariable.readline()
line+=1
def processText(dictionary, searchkey):
if searchkey in dictionary:
print(str(searchkey) + " Appears On Lines: " + (str(dictionary[searchkey])))
else:
print("No results for: " + str(searchkey))
while (True):
try:
openFile(stringinput)
readData(stringinput)
searchkey = raw_input("Enter a keyword to search for: ")
processText(dictionary, searchkey)
break
except IOError:
break

#AK47's answer for changing the if else statement works and this also works:
checkdictionary = sorted(Counter(pl7.split()).items())
change to
checkdictionary = pl7.split()

Update this following code;
if checkdictionary[i][0] in dictionary:
firstvalue = dictionary.get(checkdictionary[i][0])
newvalue = str(firstvalue) + ", " + str(line)
d1 = {checkdictionary[i][0]: newvalue}
dictionary.update(d1)
else:
d2 = {checkdictionary[i][0]: line}
dictionary.update(d2)

how to merge same object in json file using python

1.json file contain many sniffing WIFI packets, I want get the mac address of receiver and transmitter which can be found in the first "wlan" object called "wlan.ra" and "wlan.sa". data[0] is the first WIFI packet.
Q1:
But when I try to print the elements of wlan after json load, it only show the elements of the second "wlan" object so there is no "wlan.ra" and "wlan.sa" in the data.
with open('1.json','r') as json_data:
data = json.load(json_data)
a=data[0]
print a
Q2:
There are two 'wlan' objects in my json file. How can I merge the elements in these two 'wlan' objects into just one 'wlan' object?
The following is my code, but it doesn't work:
with open('1.json','r') as f:
data=json.load(f)
for i in data:
i['_source']['layers']['wlan'].update()
Screenshot of json file:

'''
Created on 2017/10/3
#author: DD
'''
import os
def modify_jsonfile(jsonfile):
'''
replace wlan to wlan1/wlan2
'''
FILESUFFIX = '_new' # filename suffix
LBRACKET = '{' # json object delimiter
RBRACKET = '}'
INTERSETED = '"wlan"' # string to be replaced
nBrackets = 0 # stack to record object status
nextIndex = 1 # next index of wlan
with open(jsonfile, 'r') as fromJsonFile:
fields = os.path.splitext(jsonfile) # generate new filename
with open(fields[0] + FILESUFFIX + fields[1], 'w') as toJsonFile:
for line in fromJsonFile.readlines():
for ch in line: # record bracket
if ch == LBRACKET:
nBrackets += 1
elif ch == RBRACKET:
nBrackets -= 1
if nBrackets == 0:
nextIndex = 1
if (nextIndex == 1 or nextIndex == 2) and line.strip().find(INTERSETED) == 0: # replace string
line = line.replace(INTERSETED, INTERSETED[:-1] + str(nextIndex) + INTERSETED[-1])
nextIndex += 1
toJsonFile.write(line);
print 'done.'
if __name__ == '__main__':
jsonfile = r'C:\Users\DD\Desktop\1.json';
modify_jsonfile(jsonfile)

Python code is not returning the full value of an element in an array

The program is intended to display a map with pins showing the locations of institutions that make use of one of our facilities. The program takes a csv file, reads the postcodes, geocodes them and places the pins on the map. The size of the pins is relative to the number of times they have used the facility.
However, when the csv file is uploaded the program generates a map with all the pins over Nigeria. Looking through the output from the program, it seems to be geocoding correctly so I am not sure what is going on. The program geocodes using an offline database as python's urllib is not compatible with the proxy setup at my office.
The program is split into two separate modules, the map generation module and the geocoding module.
Here is the map generation part:
import folium
from bottle import route, run, template, static_file, request
import urllib.request
import urllib.parse
import json
import os
os.system('start geocoder.bat')
institutionList = []
map_osm = folium.Map(location=[55, -2], zoom_start=5)
#route('/spreadsheet.html')
def send_static():
return static_file('spreadsheet.html',root='')
#route('/upload', method='POST')
def do_upload():
category = request.forms.get('category')
upload = request.files.get('upload')
name, ext = os.path.splitext(upload.filename)
if ext not in ('.csv'):
return 'File extension not allowed.'
upload.save('uploads/' + upload.filename)
fileList = []
with open('spreadsheetList','r') as f:
while True:
line = f.readline()
if not line: break
print(line.strip())
print("line should have just printed")
fileList.append(line.strip())
f.close()
lengthFileList = len(fileList)
x = 0
while x < lengthFileList:
with open(('uploads/' + fileList[x]),'r') as spread:
while True:
line = spread.readline()
if not line: break
institutionDetails = line.split(',')
institutionList.append(institutionDetails)
spread.close()
x = x + 1
spreadsheetName = upload.filename
f = open('spreadsheetList','a')
f.write(spreadsheetName + '\n')
f.close()
with open('uploads/' + spreadsheetName, 'r') as f:
while True:
line = f.readline()
if not line: break
institutionDetails = line.split(',')
institutionList.append(institutionDetails)
print(institutionList)
f.close()
lengthOfList = len(institutionList)
x = 0
coords = []
while x < lengthOfList:
address = urllib.parse.quote_plus(institutionList[x][1])
response = urllib.request.urlopen('http://localhost:80/geoCodeRequest/' + address).read().decode('utf-8')
cleanResponse = str(response).replace('"','')
coords = cleanResponse
print(cleanResponse)
institutionList[x].append(coords)
x = x + 1
print("http sources successfully accessed")
print(institutionList)
x = 0
while x < lengthOfList:
try:
map_osm.circle_marker(location=institutionList[x][3], radius=(int(institutionList[x][2]) * 10),popup=institutionList[x][0], line_color='#3186cc',fill_color='#3186cc', fill_opacity=0.2)
print("marker added")
except:
print("marker could not be added")
x = x + 1
map_osm.create_map(path='osm.html')
return '<meta http-equiv="refresh" content="0; url=osm.html">'
#route('/osm.html')
def send_static():
return static_file('osm.html',root='')
run(host='localhost', port=8080)
A batch file is used to start the second module:
#echo off
python geocodeProxyBypass.py
Here is the second module of code, the geocoding module:
from bottle import route, run, template
import string
location = []
x = 0
#route('/geoCodeRequest/<name>')
def redir(name):
x = 0
print(name)
print(name[:4])
print(name[:3])
with open('ukPostcode.csv','r') as f:
while True:
line = f.readline()
print(line)
if not line: break
locationDetails = line.split(',')
location.append(locationDetails)
print(location[x][0])
if location[x][0] == ('"' + name[:4] + '"'):
coords = location[x][3] + ", " + location[x][4]
return coords
elif location[x][0] == ('"' + name[:3] + '"'):
coords = location[x][3] + ", " + location[x][4]
return ((coords.replace('"','')))
else:
print("no match found for " + name)
x = x + 1
f.close()
run(host='localhost', port=80)
Here is what an example pin generated by the program should look like:
var circle_1 = L.circle([51.74, -1.25
], 7460, {
color: '#3186cc',
fillColor: '#3186cc',
fillOpacity: 0.2
});
circle_1.bindPopup("University of Oxford");
circle_1._popup.options.maxWidth = 300;
map.addLayer(circle_1)
Here is what is actually being output:
var circle_1 = L.circle([5, 1
], 7460, {
color: '#3186cc',
fillColor: '#3186cc',
fillOpacity: 0.2
});
circle_1.bindPopup("University of Oxford");
circle_1._popup.options.maxWidth = 300;
map.addLayer(circle_1)
Apologies for the very long question, please help!

Python 3: Why am I getting an AttributeError?

I'm running Python 3 and I'm getting the following error:
AttributeError: 'AssemblyParser' object has no attribute 'hasMoreCommands'
Here is the code that is raising the error:
import sys
from Parser import AssemblyParser
from Code import Code
parser = AssemblyParser(sys.argv[1])
translator = Code()
out_file = str(sys.argv[1]).split(".")
out_file = str(out_file[:1]) + ".hack"
with open(out_file, 'w', encoding='utf-8') as f:
while parser.hasMoreCommands():
parser.advance()
if parser.commandType() == "A_COMMAND":
dec_num = parser.symbol()
binary = "{0:b}".format(dec_num)
elif parser.commandType() == "C_COMMAND":
default_bits = "111"
comp_bits += translator.comp(parser.comp())
dest_bits += translator.dest(parser.dest())
jump_bits += translator.jump(parser.jump())
binary = default_bits + comp_bits + dest_bits + jump_bits
assert len(binary) == 16
f.write(binary)
Here is my Parser.py file:
class AssemblyParser:
"""
Encapsulates access to the input code. Reads an assembly language command,
parses it, and provides convenient access to the command's components (fields and symbols).
In addition, removes all whitespace and comments.
"""
def __init__(self, input_file):
self.current_command = ""
self.next_command = ""
with open(input_file,"r+", encoding='utf-8') as f:
for l in f:
line = "".join(l.split()) # Remove whitespace from the line
line = line.split('//') # Removes any comments from the line
clean_line = line[0]
if clean_line.strip(): # Removes any blank lines
f.write(clean_line)
next_command = f.readline()
def __hasMoreCommands__(self):
if self.next_command:
return true
return false
def __advance__(self):
with open(input_file, encoding='utf-8') as f:
self.current_command = self.next_command
self.next_command = f.readline()
def __commandType__(self):
char_1 = self.current_command[:1]
if char_1 == "#":
return "A_COMMAND"
elif char_1 == "(":
return "L_COMMAND"
else:
return "C_COMMAND"
def __symbol__(self):
assert self.commandType() == ("A_COMMAND" or "L_COMMAND")
if self.commandType() == "A_COMMAND":
symbol = str(symbol[1:])
else:
symbol = str(symbol[1:len(symbol)-1])
return str(symbol)
def __dest__(self):
assert self.commandType() == "C_COMMAND"
if "=" in self.current_command:
temp = self.current_command.split("=")
return str(temp[:1])
else:
return ""
def __comp__(self):
assert self.commandType() == "C_COMMAND"
temp = self.current_command
if "=" in temp:
temp = temp.split("=")
temp = str(temp[1:])
if ";" in temp:
temp = temp.split(";")
temp = str(temp[:1])
return temp
def __jump__(self):
assert self.commandType() == "C_COMMAND"
if ";" in self.current_command:
temp = self.current_command.split(";")
return str(temp[1:])
else:
return ""
I really don't know why I'm getting this error, I've looked at the import documentation, but I'm getting more and more confused. I'm fairly new to Python. Can anyone explain this error?
Thanks.

Well. There seems to be no function in Parser module with name hasMoreCommand. The function in there starts with underscore and end eith underscore.

Two leading and trailing underscores are used to identify "magic" attributes. You can't use that to create your own, as they only reference pre-existing methods.
The following is what you probably want:
hasMoreCommands():
If you have multiple classes with this function, use name mangling instead:
_hasMoreCommands():
See: https://stackoverflow.com/a/8689983/2030480
And: http://www.rafekettler.com/magicmethods.html

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Error in wikipedia subcategory crawling using python3 - python

Related

Updates to text file are not being parsed using python

Python dictionary search not finding result given a string

how to merge same object in json file using python

Python code is not returning the full value of an element in an array

Python 3: Why am I getting an AttributeError?

Categories

Resources