I am attempting to scrape info from a few hundred thousand files I am reading from disk and shove it into a SQLite database. I would like to be able to stop reading the files (e.g. on a crash or a user interrupt) and have the script pick up where it left off, meaning it doesn't start over with file number one every time. I've started working from a code sample that is helpful, but it assumes you have sequentially numbered files delivered in sequential order. My issues:
I understand glob returns files in arbitrary order.
Files in my directory have similar names but are not sequentially numbered.
File names in the directory might look like:
249959 Run Data Email D.eml
250000 Returned mail s.eml
250002 Warning could n.eml
Here is where I am code-wise:
cur.executescript('''
CREATE TABLE IF NOT EXISTS MailSubject (id INTEGER UNIQUE, subject TEXT)''')

# Pick up where we left off
start = None
cur.execute('SELECT max(id) FROM MailSubject')
try:
    row = cur.fetchone()
    if row is None:
        start = 0
    else:
        start = row[0]
except:
    start = 0
if start is None: start = 0

# Number user wants to grab
many = 0
# Number processed this go
count = 0
fail = 0
while True:
    if many < 1:
        conn.commit()
        sval = input('How many messages:')
        if len(sval) < 1: break
        many = int(sval)

    start = start + 1
    cur.execute('SELECT id FROM MailSubject WHERE id=?', (start,))
    try:
        row = cur.fetchone()
        if row is not None: continue
    except:
        row = None

    many = many - 1
    print("Many:", many)

    # This is where you would define URL or file name to open
    # below works for URL with sequential file names
    # url = baseurl + str(start) + '/' + str(start + 1)
    text = "None"
    try:
        os.chdir("INBOX-200")
        # This is how I open files now, no start / stop
        # Files are not sequentially numbered
        # Glob retrieves items in arbitrary order
        for file in glob.glob("*.eml"):
            try:
                with open(file, 'r') as f:
                    text = f.read()
            except KeyboardInterrupt:
                print('')
                print('Program interrupted by user...')
                break
    except:
        print('XXX File cannot be opened:', file)
        fail = fail + 1
        if fail > 5: break
        break

    # Must have succeeded, increase qty processed this round
    count = count + 1
Thanks for your assistance.
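For what it's worth, here is a minimal sketch of one way the resume logic could be keyed on file names instead of sequential ids. The table layout, column names, and the subject handling below are placeholders, not taken from the script above:

# Sketch only: resume by file name rather than by a sequential id.
# Table/column names are placeholders; real parsing of the Subject header is omitted.
import glob
import os
import sqlite3

conn = sqlite3.connect('mail.sqlite')
cur = conn.cursor()
cur.executescript('''
CREATE TABLE IF NOT EXISTS MailSubject (fname TEXT UNIQUE, subject TEXT)''')

for fname in sorted(glob.glob(os.path.join("INBOX-200", "*.eml"))):
    cur.execute('SELECT 1 FROM MailSubject WHERE fname = ?', (fname,))
    if cur.fetchone() is not None:
        continue                      # already processed on a previous run
    try:
        with open(fname, 'r', errors='replace') as f:
            text = f.read()
    except OSError:
        print('XXX File cannot be opened:', fname)
        continue
    subject = text                    # placeholder: extract the real Subject header here
    cur.execute('INSERT OR IGNORE INTO MailSubject (fname, subject) VALUES (?, ?)',
                (fname, subject))
    conn.commit()                     # commit per file so an interrupt loses at most one file

Because the work is keyed on the file name, glob's arbitrary ordering stops mattering; sorting is only there to make runs repeatable.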
Related
First of all, I should mention that I have very little experience in programming, and I have some trouble with the logic and flow of a general web scraper implemented in Python. I assume that I should use callbacks and similar methods in order to properly control the process of saving pages from a JavaScript e-book reader. My script does work, but not consistently. If someone could advise me on improvements that should be made to this script, that would be great. Thank you.
from seleniumwire import webdriver  # assumed: selenium-wire's webdriver (needed for driver.requests); not shown in the original snippet
from seleniumwire.utils import decode as sdecode
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options  # [!]
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import time
import os.path

opts = Options()  # [!]
opts.add_experimental_option('w3c', True)  # [!]
capabilities = DesiredCapabilities.CHROME.copy()
driver = webdriver.Chrome(chrome_options=opts, desired_capabilities=capabilities)

url = ' here comes url'
driver.get(url)

def get_requests():
    l = []
    for rx in driver.requests:
        # endmark = '&scale=2&rotate=0' length must be 17
        if rx.url[-17:] == endmark:
            l.append(rx.url)
    return list(set(l))

def savepages(diff):
    newpages = 0
    for urlitem in diff:
        for request in driver.requests:
            if request.url == urlitem:
                # print(request.url)
                ind = urlitem.find('.jp2&id')  # ex. 0012.jp2&id
                file_path = directory_path + '\\' + file_name + urlitem[ind-4:ind] + '.jpg'
                tik = 0
                while tik < 10:  # waiting for the response body data
                    try:
                        tik += 1
                        data = sdecode(request.response.body, request.response.headers.get('Content-Encoding', 'identity'))
                    except AttributeError:  # no data error
                        time.sleep(2)  # wait for 2 sec for the data
                        continue
                # data = data.decode("utf-8",'ignore')
                # sometimes I get this error 'UnboundLocalError: local variable 'data' referenced before assignment'
                # I assumed that the following condition will help but it doesn't seem to work consistently
                if data:
                    with open(file_path, 'wb') as outfile:
                        outfile.write(data)  # sometimes I get UnboundLocalError
                else:
                    print('no data')
                # was the file saved or not
                if os.path.exists(file_path):
                    newpages += 1  # smth is wrong with the counting logic, since pages+newpages should be equal to the length of li=get_requests(), I get more
                else:
                    time.sleep(.5)
    return newpages

count = 0   # a counter, should terminate the main delay loop
pages = 0   # counting all saved pages; book pages or images are equivalent, one turn should open 2 new pages/images/requests
oldli = []  # compare to the new list after each delay cycle
turns = 0   # count how many turns have been made or how many times we clicked on the button Next Page

li = get_requests()  # get all unique requests of the images/pages, some requests might be still loading, but we manually opened the first page and visually confirmed that there are at least 1 or 3 images/requests
if li:  # the program STARTS HERE, first try, there are some requests because we manually opened the first page
    # THE MAIN CYCLE should stop when the delay is too long and we turned all the pages of the book
    while 2*turns+1 < len(li) or count < 15:  # should terminate the whole program when there is no more images coming
        count = 0        # reset counter
        success = False  # reset success; new pages downloaded successfully
        # the main delay counter
        # what happens if diff is [] and no success
        while True:
            count += 1
            if count > 14:
                print('Time out after more than 10 seconds.')
                break
            li = get_requests()  # in addition, I assume that all requests counting from page 1 will be kept
            # it is possible that li will not have some of the old requests and oldli will be longer
            # well, I need to keep all old requests in a separate list and then append to it
            diff = list(set(li) - set(oldli))  # find new requests after the delay
            if diff:  # there are some new
                npages = savepages(diff)  # saves new images and returns the number of them
                print('newpages ', npages, ' len diff ', len(diff))  # should be equal
                if npages >= len(diff) - 1:  # we allow one request without a body with data ??
                    pages += npages  # smth is not ok here, the number of pages sometimes exceeds the length of li
                    success = True   # we call it a success
                else:
                    print('Could not save pages. Newpages ', npages, ' len diff ', len(diff))
                    for pg in diff:
                        print(pg)  # for debugging purposes
                    break  # in this case you break from the delay cycle
            else:
                time.sleep(2)  # if no new requests add 2 sec to the waiting time
            if success:  # we turn pages in case of successful download, this is bad if we need to catch up
                while 2*turns+1 < len(li):  # if some of old requests are deleted then the program will stop earlier
                    # it won't wait for the bodies of requests, there is a problem
                    driver.find_elements(By.CLASS_NAME, "BRicon.book_right.book_flip_next")[0].click()
                    turns += 1
                    time.sleep(3)  # I got the impression that this doesn't happen
                oldli = li
                print('pages ', pages, ' length of list ', len(li))
                break  # we break from the delay cycle since success
            time.sleep(2)  # the main delay timer;; plus no diff timer = total time
else:
    print('no requests in the list to process')
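As a side note on the waiting logic, here is a minimal sketch that fetches a response body via selenium-wire's driver.wait_for_request() instead of a hand-rolled retry counter. It assumes a selenium-wire version that provides that helper; driver is the existing driver from the script above, and url_part would be a fragment such as '.jp2&id':

# Sketch only: wait for a matching request to get a response body, return None otherwise.
from selenium.common.exceptions import TimeoutException
from seleniumwire.utils import decode as sdecode

def fetch_body(driver, url_part, timeout=10):
    """Return the decoded body of the first captured request whose URL contains url_part, or None."""
    try:
        request = driver.wait_for_request(url_part, timeout=timeout)
    except TimeoutException:
        return None  # nothing matching arrived in time
    if request.response is None or not request.response.body:
        return None  # matched the URL but the body never arrived
    return sdecode(request.response.body,
                   request.response.headers.get('Content-Encoding', 'identity'))

Because the function always returns something (a body or None), the caller can test the result directly and avoid the "referenced before assignment" situation described in the comments above.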
I am working on a Python script that reads data from a big Excel file and stores the necessary data in corresponding dictionary variables.
Once all of the file's data is read and processed, the script iterates through the dictionaries in batches of 100 records and calls a function that inserts them into a table using executemany.
The thing is, some records are NOT being inserted into a specific table, and other tables are not inserting properly either, even though I can see the data being passed.
Ex: record ID IH1-01-01 is picked up for insertion but never ends up in the table.
I don't see any fatal errors or anything that points to why this is happening.
What else could be causing these records to not be inserted?
Here's part of the code for your review:
#function that doesn't work properly
def InsertarAnaquelPisoLetraMasivo(queryAnaquel):
    try:
        query = "INSERT INTO Anaquel(anaq_id,anaq_nombre,anaq_piso, anaq_letra, anaq_cantidad,anaq_movimiento, anaq_almacen) VALUES (NULL,%s, %s, %s, %s, %s, 'Mex')"
        cursor.executemany(query, queryAnaquel)
        #added so I can see what data was being passed
        #I can confirm all records are there but don't get inserted for some reason
        for line in queryAnaquel:
            print line
    except:
        print ("Error ", sys.exc_info()[0])

#iterate through excel file
for row in range(2, sheet.max_row + 1):
    #...
    #read item's location
    ubicacion = sheet['H' + str(row)].value if sheet['H' + str(row)].value is not None else ""
    if revisarSiLlaveEsConsiderada(ubicaciones, ubicacion):
        ubicaciones[ubicacion] += 1
    else:
        ubicaciones[ubicacion] = 1

#part that processes data and calls for insert
print "Configurando anaqueles"
for key in ubicaciones:
    ubicacionSplit = key.split("-")
    tipoAnaquel = "Alto"
    if len(ubicacionSplit) > 2:
        tipoAnaquel = "Isla"
        pis = ubicacionSplit[1]
        let = ubicacionSplit[2]
    elif len(ubicacionSplit) == 1:
        pis = ''
        let = ''
    else:
        pisoLe = ubicacionSplit[1]
        if len(pisoLe) > 1:
            if len(pisoLe) == 2:
                pis = pisoLe[0]
                let = pisoLe[1]
            else:
                pis = pisoLe
                let = ''
        else:
            pis = pisoLe
            let = ''
    ubs = ubicacionSplit[0]
    cantidad = ubicaciones[key]
    detalleAnaquel = detalleUbicacion(ubs, pis, let)
    if detalleAnaquel == "":
        #Here's where to look
        value = (ubs, pis, let, cantidad, tipoAnaquel)
        queryAnaquel.append(value)
        if cuentaQueryAnaquel == 100:
            InsertarAnaquelPisoLetraMasivo(queryAnaquel)
            cuentaQueryAnaquel = 1
            queryAnaquel = []
        else:
            cuentaQueryAnaquel += 1
    else:
        idAnaquel = detalleAnaquel
        updateAnaquel(idAnaquel, cantidad)

#In case we haven't reached 100 after iterating through all records
if len(queryAnaquel) > 0:
    InsertarAnaquelPisoLetraMasivo(queryAnaquel)
    queryAnaquel = []
    cuentaQueryAnaquel = 1
Table's config:
CREATE TABLE IF NOT EXISTS `Anaquel` (
`anaq_id` int(11) NOT NULL AUTO_INCREMENT,
`anaq_nombre` varchar(50) NOT NULL,
`anaq_movimiento` enum('Alto','Mediano','Bajo','Caja','Isla','') NOT NULL DEFAULT 'Bajo',
`anaq_piso` varchar(20) NOT NULL,
`anaq_letra` varchar(1) NOT NULL,
`anaq_cantidad` int(11) NOT NULL,
`anaq_capacidad` int(1) NOT NULL,
`anaq_almacen` enum('Mex','Usa') NOT NULL,
PRIMARY KEY (`anaq_id`),
KEY `anaq_nombre` (`anaq_nombre`)
) ENGINE=InnoDB AUTO_INCREMENT=26701 DEFAULT CHARSET=latin1;
Problem data, as printed at the command prompt:
(u'IH1', u'1', u'4', 38, 'Isla')
Edit: I had previously suppressed warnings in the script; I have now commented out the call that suppressed them.
I see a lot of warnings that say something like this:
C:\Python27\lib\site-packages\MySQLdb\cursors.py:206: Warning: Incorrect string value: '\xCC\x81s qu...' for column 'booName_title' at row 1
r = r + self.execute(query, a)
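Not a diagnosis by itself, but a small sketch (assuming the same cursor plus a conn connection object, neither of which the excerpt shows being created) that surfaces the real MySQL error and the affected row count instead of swallowing them, and commits explicitly:

# Sketch only: same batch insert, but report errors/rowcount and commit explicitly.
import MySQLdb

def insertar_anaquel_debug(queryAnaquel):
    query = ("INSERT INTO Anaquel(anaq_id, anaq_nombre, anaq_piso, anaq_letra, "
             "anaq_cantidad, anaq_movimiento, anaq_almacen) "
             "VALUES (NULL, %s, %s, %s, %s, %s, 'Mex')")
    try:
        cursor.executemany(query, queryAnaquel)
        conn.commit()  # explicit commit, in case the full script never commits
        print("rows inserted: %d of %d" % (cursor.rowcount, len(queryAnaquel)))
    except MySQLdb.Error as e:
        conn.rollback()
        print("insert failed: %s" % e)  # the full error, not just its type
        raise

If the printed rowcount matches the batch size but the rows still don't persist, a missing commit is a likely culprit; if an exception prints, the bare except in the original was hiding it.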
I'm having a problem with a nested for loop (for doc in query) that only runs once. It's inside for item in news_items, which I have verified iterates 10 times, and the for doc in query loop should iterate 9 times. When I print doc, it prints 9 documents; however, when I add an if/else check on the document's content, the check only runs during one outer iteration. (I would expect 9 x 10 outputs, since every item from the parent loop should be checked against every doc in query, but all I get is 9 outputs.)
I've searched Stack Overflow but nothing I found seems relevant. Coming from the other programming languages I work with, I don't see why this wouldn't work, but maybe I'm missing something since I'm fairly new to Python (1 week).
def scrape(url):
    # GET DATE AT THE TIME OF CRAWL START
    today = date.today()
    d1 = today.strftime("%d/%m/%Y")
    # D2 is used for query only
    d2 = today.strftime("%Y%m%d")
    # LOAD URL IN DRIVER
    driver.get(url)
    try:
        news_container = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "FlashNews-Box-Root"))
        )
        # array of items
        news_items = news_container.find_elements_by_class_name("FlashNews-Box-Item")
        refresher_ref = db.collection(u'news').document('sources').collection('refresher_news')
        # query for last article
        query = refresher_ref.order_by(u'article_timestamp', direction=firestore.Query.DESCENDING).limit(10).stream()
        for item in news_items:
            print("News items found: " + str(len(news_items)))
            try:
                # image is optional so we need to try it
                try:
                    item_image = item.find_element_by_class_name("FlashNews-Box-ItemImage").find_element_by_tag_name(
                        "img").get_attribute("src")
                except Exception as e:
                    item_image = "unavailable"
                # time will be added to the same day as when this was ran, since this will run often and compare
                # article texts, we won't have issue with wrong dates
                item_time = item.find_element_by_class_name("FlashNews-Box-ItemTime").text + " " + d1
                item_time_query_temp = item.find_element_by_class_name("FlashNews-Box-ItemTime").text.replace(":", "")
                # normalize timestamp for sorting
                if len(item_time_query_temp) == 3:
                    item_time_query_temp = "0" + item_time_query_temp
                item_time_query = d2 + item_time_query_temp
                item_text = item.find_element_by_class_name("FlashNews-Box-ItemText").text
                item_redirect = item.find_element_by_class_name("FlashNews-Box-ItemText").find_element_by_tag_name(
                    "a").get_attribute("href")
                result = {"article_time": item_time, "article_url": item_redirect, "article_image": item_image,
                          "article_text": item_text, "article_timestamp": item_time_query}
                # print(result)
                # save data to firestore - check for last item in firestore, then add this article
                is_new = True
                print("Printing 10x")
                # THIS EXECUTES ONLY ONCE?
                for doc in query:
                    # print(str(len(query)))
                    current_doc = doc.to_dict()
                    # print(current_doc)
                    # print("Iteration: " + current_doc['article_text'])
                    # print("Old: " + current_doc["article_text"] + " New: " + item_text)
                    if current_doc['article_text'] == item_text:
                        print("Match")
                        # print(current_doc['article_text'] + item_text)
                        # print("Old: " + current_doc['article_text'] + " New: " + item_text)
                    else:
                        print("Mismatch")
                        # print(current_doc['article_text'] + item_text)
                        # print("Skipping article as the text exists in last 10")
                # else:
                #     print("Old: " + current_doc['article_text'] + " New: " + item_text)
                # print(str(is_new))
                # if is_new:
                #     refresher_ref.add(result)
                #     print("Adding document")
            except Exception as e:
                print(e)
    except Exception as e:
        # HANDLE ERRORS
        print(e)
    print("Completed running.")
    # quit driver at the end of function run
    driver.quit()
query isn't a list, but some other iterable type that you can only consume once (similar to a generator). In order to use it multiple times in the outer loop, you'll need to create a list to hold the contents in memory. For example,
# query for last article
query = refresher_ref.order_by(u'article_timestamp', direction=firestore.Query.DESCENDING).limit(10).stream()
query = list(query)
for item in news_items:
    ...
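A quick way to see the underlying behaviour with a plain generator (no Firestore needed):

gen = (n * n for n in range(3))
print(list(gen))   # [0, 1, 4]
print(list(gen))   # [] -- the generator is already exhausted, so a second loop sees nothing

nums = list(n * n for n in range(3))   # materialize once...
print(list(nums))  # [0, 1, 4]
print(list(nums))  # [0, 1, 4] -- ...and the list can be iterated as many times as needed

The Firestore stream behaves the same way, which is why the inner loop only produces output during the first pass of the outer loop.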
This application will read the mailbox data (mbox.txt), count the number of email messages per organization (i.e. the domain name of the email address), and use a database with the following schema to maintain the counts.
CREATE TABLE Counts (org TEXT, count INTEGER)
When you have run the program on mbox.txt, upload the resulting database file above for grading.
If you run the program multiple times in testing or with different files, make sure to empty out the data before each run.
You can use this code as a starting point for your application: http://www.pythonlearn.com/code/emaildb.py. The data file for this application is the same as in previous assignments: http://www.pythonlearn.com/code/mbox.txt.
This is my first time learning SQLite. I am very confused about this assignment even though it seems easy. I don't know how to connect Python code to SQLite. It seems they don't need the code for the assignment; all they need is the database file. How should I solve this problem? I don't know how to start. Much appreciated!
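As background for the "how do I connect Python code to SQLite" part: the standard-library sqlite3 module is all that is required. A bare-bones illustration, independent of the assignment code that follows:

import sqlite3

conn = sqlite3.connect('example.sqlite')   # creates the file if it doesn't exist
cur = conn.cursor()
cur.execute('CREATE TABLE IF NOT EXISTS Counts (org TEXT, count INTEGER)')
cur.execute('INSERT INTO Counts (org, count) VALUES (?, ?)', ('uct.ac.za', 1))
conn.commit()                              # write the change to the .sqlite file on disk
print(cur.execute('SELECT org, count FROM Counts').fetchall())
conn.close()

The .sqlite file that connect() creates on disk is the database file the assignment asks to be uploaded.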
The starting code you've been given is a really good template for what you want to do. The difference is that - in that example - you're counting occurrences of email addresses, and in this problem you're counting domains.
The first thing to do is think about how to get domain names from email addresses. Building from the code given (which sets email = pieces[1]):
domain = email.split('@')[1]
This will break the email on the '@' character and return the second item (the part after the '@'), which is the domain - the thing you want to count.
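For example, with a typical From: address from mbox.txt:

email = 'stephen.marquard@uct.ac.za'   # a From: address as parsed by the template code
domain = email.split('@')[1]
print(domain)                          # uct.ac.za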
After this, go through the SQL statements in the code and replace 'email' with 'domain', so that you're counting the right thing.
One last thing - the template code checks 'mbox-short.txt' - you'll need to edit that as well for the file you want.
import sqlite3

conn = sqlite3.connect('emaildb2.sqlite')
cur = conn.cursor()

cur.execute('''
DROP TABLE IF EXISTS Counts''')
cur.execute('''
CREATE TABLE Counts (org TEXT, count INTEGER)''')

fname = input('Enter file name: ')
if (len(fname) < 1): fname = 'mbox.txt'
fh = open(fname)
list_1 = []
for line in fh:
    if not line.startswith('From: '): continue
    pieces = line.split()
    email = pieces[1]
    dom = email.find('@')
    org = email[dom+1:len(email)]
    cur.execute('SELECT count FROM Counts WHERE org = ? ', (org,))
    row = cur.fetchone()
    if row is None:
        cur.execute('''INSERT INTO Counts (org, count)
                VALUES (?, 1)''', (org,))
    else:
        cur.execute('UPDATE Counts SET count = count + 1 WHERE org = ?',
                    (org,))
    conn.commit()

# https://www.sqlite.org/lang_select.html
sqlstr = 'SELECT org, count FROM Counts ORDER BY count DESC LIMIT 10'
for row in cur.execute(sqlstr):
    print(str(row[0]), row[1])

cur.close()
I am still new here, but I want to thank Stidgeon for pointing me in the right direction. I suspect other Using Databases with Python students will end up here too.
There are two things you need to do with the source code.
Add domain = email.split('@')[1], as in Stidgeon's answer (the starting code is http://www.pythonlearn.com/code/emaildb.py).
Change email TEXT to org TEXT when the database is generated.
That should get you on your way.
import sqlite3

conn = sqlite3.connect('emaildb.sqlite')
cur = conn.cursor()

cur.execute('DROP TABLE IF EXISTS Counts')
cur.execute('''
CREATE TABLE Counts (org TEXT, count INTEGER)''')

fname = input('Enter file name: ')
if (len(fname) < 1): fname = 'mbox-short.txt'
fh = open(fname)
for line in fh:
    if not line.startswith('From: '): continue
    pieces = line.split()
    org = pieces[1].split('@')
    cur.execute('SELECT count FROM Counts WHERE org = ? ', (org[1],))
    row = cur.fetchone()
    if row is None:
        cur.execute('''INSERT INTO Counts (org, count)
                VALUES (?, 1)''', (org[1],))
    else:
        cur.execute('UPDATE Counts SET count = count + 1 WHERE org = ?',
                    (org[1],))
    conn.commit()

# https://www.sqlite.org/lang_select.html
sqlstr = 'SELECT org, count FROM Counts ORDER BY count DESC LIMIT 10'
for row in cur.execute(sqlstr):
    print(str(row[0]), row[1])

cur.close()
print('-----------------done----------------')
The simple Python code below opens a file (sample attached) and extracts all the rows for each user into that user's respective file. However, it's not working properly. Note that the file is sorted by USER.
queries_file = file("queries_new_distinct_sorted.csv")

old_user = None
current_user = None
user_file = None
queries = ""  # variable to capture all the rows as \n separated string

for line in queries_file:
    query, user, timestamp, categories = line.strip().split(",")
    # to extract component from the csv file
    current_user = user
    if old_user == None:
        old_user = current_user
        queries += timestamp + "," + categories + "\n"
    elif old_user != current_user:
        if len(queries.strip().split('\n')) > 3:
            user_file = open("users/" + old_user, 'w+')
            user_file.write(queries)
            user_file.close()
            old_user = current_user
            queries = timestamp + "," + categories + "\n"
    else:
        queries += timestamp + "," + categories + "\n"

queries_file.close()
File:
ipd,6086533,2006-05-12_19:45:23,shopping/vehicles/parts and accessories
tire rack,6086533,2006-05-13_22:29:44,shopping/vehicles/parts and accessories
volvo r 70 speed parts,6086533,2006-05-14_23:04:33,shopping/vehicles/autos
puerto rico,6086589,2006-03-07_21:39:36,travel and tourism/travel guides
espn spanish,6086589,2006-03-12_19:53:26,world/español/medios de comunicación
Found the problem.
I was not resetting the values in the case where the length of queries was less than 2.
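As an aside, the same grouping can be sketched with itertools.groupby, assuming the same user-sorted CSV and output layout as above (this keeps the original "more than 3 rows" threshold):

# Sketch: group the user-sorted CSV by user and write one file per user,
# mirroring the script above (same input file name and users/ directory assumed).
import csv
from itertools import groupby

with open("queries_new_distinct_sorted.csv") as queries_file:
    rows = csv.reader(queries_file)
    for user, group in groupby(rows, key=lambda row: row[1]):
        lines = ["%s,%s\n" % (row[2], row[3]) for row in group]
        if len(lines) > 3:  # same threshold as the original script
            with open("users/" + user, "w") as user_file:
                user_file.writelines(lines)

Because groupby emits one group per run of identical keys, there is no old_user/current_user bookkeeping to reset, which is the part that went wrong in the original loop.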