How to fetch xml data to write in a database - python

I'm working on a Python script that reads XML data from a server and stores it in a database. When I create the database, the whole XML document gets written into the database file instead of each programme being fetched separately, and the table is never created. It ends up looking like this: http://imageshack.com/a/img401/4210/ofa5.jpg
The XML I get from the server: http://ontv.dk/xmltv/c81e728d9d4c2f636f067f89cc14862c
Here is the current code:
import xbmc
import xbmcgui
import xbmcaddon
import os
import urllib
import urllib2
import StringIO
import sqlite3
import datetime
import time
from xml.etree import ElementTree

ADDON = xbmcaddon.Addon(id = 'script.myaddon')

class MyScript(xbmcgui.WindowXML):
    def __new__(cls):
        return super(MyScript, cls).__new__(cls, 'script-menu.xml', ADDON.getAddonInfo('path'))

    def onInit(self):
        #DOWNLOAD THE XML SOURCE HERE
        url = ADDON.getSetting('ontv.url')
        req = urllib2.Request(url)
        response = urllib2.urlopen(req)
        data = response.read()
        response.close()
        profilePath = xbmc.translatePath(os.path.join('special://userdata/addon_data/script.tvguide', ''))
        io = StringIO.StringIO(req)
        context = ElementTree.iterparse(io)
        if os.path.exists(profilePath):
            profilePath = profilePath + 'source.db'
            con = sqlite3.connect(profilePath)
            cur = con.cursor()
            cur.execute('CREATE TABLE programs(channel TEXT, title TEXT, start_date TIMESTAMP, end_date TIMESTAMP, description TEXT, image_large TEXT, image_small TEXT, source TEXT, updates_id INTEGER, FOREIGN KEY(channel, source) REFERENCES channels(id, source) ON DELETE CASCADE, FOREIGN KEY(updates_id) REFERENCES updates(id) ON DELETE CASCADE)')
            cur.close()
            fc = open(profilePath, 'w')
            fc.write(data)
            fc.close
I want to fetch each item from the XML and write it into the database after the table has been created. How do you write the XBMC code so that each programme from the XML is stored in the database once the table exists?

I haven't got the xbmc module installed, so this code is based on loading the XML from a file and then parsing through it.
I couldn't see any references to image_large, image_small or updates_id in the XML, so I left those commented out. There is probably a better way of doing this, but this should get you started and hopefully from here you should be able to work out how to loop through the list to write each programme to your database table.
import xml.etree.ElementTree as ET

tree = ET.parse('epg.xml')
root = tree.getroot()

programmes = []
for item in root.findall('programme'):
    programme = {}
    programme["channel"] = item.attrib['channel']
    programme["title"] = item.find('title').text
    programme["start_date"] = item.attrib['start']
    programme["end_date"] = item.attrib['stop']
    programme["description"] = item.find('desc').text
    #programme["image_large"] =
    #programme["image_small"] =
    programme["source"] = item.find('icon').attrib['src']
    #programme["updates_id"] =
    programmes.append(programme)
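If it helps, here is a rough sketch (untested, since I can't run xbmc here) of how that list of dictionaries could then be written into the programs table from your CREATE TABLE statement, using sqlite3's executemany with named placeholders; the database path is a placeholder you would swap for your profilePath:

import sqlite3

# assumption: the programs table from the question has already been created
con = sqlite3.connect('source.db')  # swap in profilePath + 'source.db'
cur = con.cursor()
cur.executemany(
    'INSERT INTO programs(channel, title, start_date, end_date, description, source) '
    'VALUES (:channel, :title, :start_date, :end_date, :description, :source)',
    programmes)  # each dict's keys match the named placeholders above
con.commit()
con.close()

Note that the start/stop attributes are stored here as the raw XMLTV strings; you may want to convert them to proper timestamps first.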

Related

Use Python to scrape images from xml tags

I am trying to write a short Python program that downloads a copy of the local county's XML jail roster, saves that file, scrapes all the names and image links into a CSV file, and then downloads each of the photos, using the person's name as the file name.
I've managed to get the XML file, save it locally, and create the CSV file. I was briefly able to write the full XML tag (tag and attribute) to the CSV file, but I can't seem to get just the attribute, or the image links.
from datetime import datetime
from datetime import date
import requests
import csv
import bs4 as bs
from bs4 import BeautifulSoup

# get current date
today = date.today()
# convert date to date-sort format
d1 = today.strftime("%Y-%m-%d")

# create filename variable
roster = 'jailroster' + '-' + d1 + '-dev' + '.xml'

# grab xml file from server
url = "fakepath.xml"
print("ATTEMPTING TO GET XML FILE FROM SERVER")
req_xml = requests.get(url)
print("Response code:", req_xml.status_code)
if req_xml.status_code == 200:
    print("XML file downloaded at ", datetime.now())
    soup = BeautifulSoup(req_xml.content, 'lxml')

# save xml file from get locally
with open(roster, 'wb') as file:
    file.write(req_xml.content)
    print('Saving local copy of XML as:', roster)

# read xml data from saved copy
infile = open(roster, 'r')
contents = infile.read()
soup = bs.BeautifulSoup(contents, 'lxml')

# variables needed for image list
images = soup.findAll('image1')
fname = soup.findAll('nf')
mname = soup.findAll('nm')
lname = soup.findAll('nl')
baseurl = 'fakepath.com'

with open('image-list.csv', 'w', newline='') as csvfile:
    imagelist = csv.writer(csvfile, delimiter=',')
    print('Image list being created')
    imagelist.writerows(images['src'])
I've gone through about half a dozen tutorials trying to figure all this out, but I think this is the edge of what I've been able to learn so far, and I haven't even started to work out how to save the list of images as files. Can anyone help with a pointer or two, or point me towards tutorials on this?
Update: No, this is not for a mugshot site or any unethical purpose. The data is for a private, non-public public-safety project.
This should get you the data you need:
from datetime import date
import requests
from bs4 import BeautifulSoup
import pandas as pd


def extractor(tag: str) -> list:
    return [i.getText() for i in soup.find_all(tag)]


url = "https://legacyweb.randolphcountync.gov/sheriff/jailroster.xml"
soup = BeautifulSoup(requests.get(url).text, features="lxml")

images = [
    f"{'https://legacyweb.randolphcountync.gov'}{i['src'].lstrip('..')}"
    for i in soup.find_all('image1')
]

df = pd.DataFrame(
    zip(extractor("nf"), extractor("nm"), extractor("nl"), images),
    columns=['First Name', 'Middle Name', 'Last Name', 'Mugshot'],
)

df.to_csv(
    f"jailroster-{date.today().strftime('%Y-%m-%d')}-dev.csv",
    index=False,
)
Sample output is a .csv file with the columns First Name, Middle Name, Last Name and Mugshot.
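Since you mentioned you had not started on saving the photos themselves, here is a rough sketch of how the same DataFrame could be used to download each image; the mugshots folder, the name-based filename pattern, and the .jpg extension are just assumptions:

import os
import requests

os.makedirs("mugshots", exist_ok=True)
for _, row in df.iterrows():
    # build a filename like "John-A-Doe" from the name columns
    name = "-".join(filter(None, [row["First Name"], row["Middle Name"], row["Last Name"]]))
    resp = requests.get(row["Mugshot"])
    if resp.status_code == 200:
        # the .jpg extension is an assumption; adjust to the real image type
        with open(os.path.join("mugshots", name + ".jpg"), "wb") as fh:
            fh.write(resp.content)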

Python insert to database only one data from tuple

I have a problem. I need to parse multiple XML files and insert the data into a database.
import os
from lxml import etree
import sqlite3

conn = sqlite3.connect("xml.db")
cursor = conn.cursor()

path = 'C:/tools/XML'
for filename in os.listdir(path):
    fullname = os.path.join(path, filename)
    tree = etree.parse(fullname)
    test = tree.xpath('//*[@name="Name"]/text()')
    tpl = tuple(test)
    cursor.executemany("INSERT INTO parsee VALUES (?);", (tpl,))
    conn.commit()

sql = "SELECT * FROM parsee"
cursor.execute(sql)
print(cursor.fetchall())
result:
[('testname1',)]
If I run the program again the program adds another same name. Result:
[('testname1',),('testname1',)]
There are 100 files in folder:
<curent name="Name">testname1<curent>
<curent name="Name">testname2<curent>
<curent name="Name">testname3<curent>
<curent name="Name">testname4<curent>
Since I don't have admin rights to install lxml on my computer, I will use a module that ships with Python ("batteries included") to deal with XPaths: xml.etree.ElementTree. My code will still show you how to insert multiple records into SQLite using executemany().
It looks like C:/tools/XML will contain many XML files with the same structure.
I put the following two in a folder to simulate this (I noticed that your example uses 'curent' as the element name; I am not sure if that is a typo, so I am using 'current').
file1.xml
<note>
<current name="Name">testname1</current>
<current name="Name">testname2</current>
<otherdetail></otherdetail>
</note>
file2.xml
<note>
<current name="Name">testname3</current>
<current name="Name">testname4</current>
<otherdetail></otherdetail>
</note>
I created an SQLite database file called xml.db and a table in it with the following statement:
CREATE TABLE PARSEE (NAME VARCHAR(100));
And here is my python script
import os
import xml.etree.ElementTree as ET
import sqlite3

conn = sqlite3.connect("xml.db")
cursor = conn.cursor()

path = 'C:/tools/XML'
for filename in os.listdir(path):
    fullname = os.path.join(path, filename)
    print("Parsing file: %s" % fullname)
    tree = ET.parse(fullname)
    root = tree.getroot()
    elements = root.findall(".//*[@name='Name']")
    names = [(e.text,) for e in elements]
    print("Names found: %s" % names)
    cursor.executemany("INSERT INTO PARSEE VALUES (?)", names)
    conn.commit()

sql = "SELECT * FROM PARSEE"
print("Printing table PARSEE content")
cursor.execute(sql)
print(cursor.fetchall())
And here is the output: it prints the names found in each file and then the full content of the PARSEE table.

using supported site for video processing

I am trying to change my code to support video processing from multiple sites (YouTube, Vimeo, etc.) using the youtube-dl extractors. I don't want to import youtube-dl unless necessary; I would prefer to call a function. My understanding is that youtube-dl http://vimeo.com/channels/YOUR-CHANNEL is a command-line tool. Please help!
import pymongo
import get_media
import configparser as ConfigParser


# shorten list to first 10 items
def shorten_list(mylist):
    return mylist[:10]


def main():
    config = ConfigParser.ConfigParser()
    config.read('settings.cfg')
    youtubedl_filename = config.get('media', 'youtubedl_input')
    print('creating file: %s - to be used as input for youtubedl' % youtubedl_filename)

    db = get_media.connect_to_media_db()
    items = db.raw

    url_list = []
    cursor = items.find()
    records = dict((record['_id'], record) for record in cursor)

    # iterate through records in media items collection
    # if 'Url' field exists and starts with youtube, add url to list
    for item in records:
        item_dict = records[item]
        #print(item_dict)
        if 'Url' in item_dict['Data']:
            url = item_dict['Data']['Url']
            if url.startswith('https://www.youtube.com/'):
                url_list.append(url)

    # for testing purposes
    # shorten list to only download a few files at a time
    url_list = shorten_list(url_list)

    # save list of youtube media file urls
    with open(youtubedl_filename, 'w') as f:
        for url in url_list:
            f.write(url + '\n')


if __name__ == "__main__":
    main()
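In case it helps, here is a hedged sketch of two ways to hand the collected URLs to youtube-dl from inside the script rather than on the command line. Both assume the youtubedl_filename and url_list variables from main(), and the second one requires the youtube_dl package to be installed:

# Option 1: shell out to the command-line tool, feeding it the file the
# script already writes (-a/--batch-file reads URLs from a file)
import subprocess
subprocess.run(['youtube-dl', '-a', youtubedl_filename], check=True)

# Option 2: call youtube-dl's embedding API directly as a function
import youtube_dl
ydl_opts = {'outtmpl': '%(title)s.%(ext)s'}  # output filename template
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
    ydl.download(url_list)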

Getting wrong characters in pt-br from xml in python

I'm trying to send data from an XML feed to a MySQL database, but I'm getting wrong pt-BR characters in Python and MySQL.
import MySQLdb
import urllib2
import sys
import codecs
import xmltodict

## default encoding
reload(sys)
sys.setdefaultencoding('utf-8')
UTF8Writer = codecs.getwriter('utf8')
sys.stdout = UTF8Writer(sys.stdout)

file = urllib2.urlopen('feed.xml')
data = file.read()
file.close()
data = xmltodict.parse(data)

db = MySQLdb.connect(host=MYSQL_HOST,      # your host, usually localhost
                     user=MYSQL_USER,      # your username
                     passwd=MYSQL_PASSWD,  # your password
                     db=MYSQL_DB)          # name of the data base
cur = db.cursor()

product_name = str(data.items()[0][1].items()[2][1].items()[3][1][i].items()[1][1])
But when I print product_name in Python or insert it into mysql, I get this:
'Probi\xc3\xb3tica (120caps)'
this should be:
'Probiótica'
How can I fix this?
'Probi\xc3\xb3tica' is the utf-8 encoded version of 'Probiótica'.
Is your terminal (or whatever you are using to run this) set up to handle utf-8 output?
Try print 'Probi\xc3\xb3tica'.decode('utf-8') to see what happens.
I get Probiótica.
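For what it's worth, here is a sketch of the usual two-part fix: keep the value as a unicode object instead of forcing it through str(), and tell MySQLdb to talk UTF-8 on the connection. This assumes your MySQL table itself already uses a utf8 character set:

db = MySQLdb.connect(host=MYSQL_HOST,
                     user=MYSQL_USER,
                     passwd=MYSQL_PASSWD,
                     db=MYSQL_DB,
                     charset='utf8',    # send/receive UTF-8 on the wire
                     use_unicode=True)  # hand unicode objects back to Python

# keep the parsed value as unicode; do not wrap it in str()
product_name = data.items()[0][1].items()[2][1].items()[3][1][i].items()[1][1]

# encode only at the boundary, e.g. when printing to a UTF-8 terminal
print product_name.encode('utf-8')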

Internet History Script For Google Chrome

I'm not looking for the "best" or most efficient script to do this, but I was wondering if there exists a script to pull a day's worth of Internet history from, say, Google Chrome and log it to a txt file. I'd prefer it to be in Python or MATLAB.
If you guys have a different method using one of these languages utilizing locally stored browser history data from Google Chrome, I'd be all ears for that too.
I'd be super-thankful if anyone could help with this!
From my understanding, this seems easy to do, though I don't know if it is what you want.
Chrome stores its Internet history at a specific path. On Windows 7, for example, it is: C:\Users\[username]\AppData\Local\Google\Chrome\User Data\Default\History
In Python:
f = open('C:\Users\[username]\AppData\Local\Google\Chrome\User Data\Default\History', 'rb')
data = f.read()
f.close()
f = open('your_expected_file_path', 'w')
f.write(repr(data))
f.close()
Building on what m170897017 said:
That file is an sqlite3 database, so taking repr() of its contents won't do anything meaningful.
You need to open the sqlite database and run SQL against it to get the data out. In python use the sqlite3 library in the stdlib to do this.
Here's a related SuperUser question that shows some SQL for getting URLs and timestamps: https://superuser.com/a/694283
I dodged sqlite3/sqlite by using the Google Chrome extension "Export History", exporting everything into a CSV file, and subsequently loading that CSV file into cells within MATLAB.
Export History
My code turned out to be:
file_o = ['history.csv'];
fid = fopen(file_o, 'rt');
fmt = [repmat('%s', 1, 6) '%*[^\n]'];
C = textscan(fid,fmt,'Delimiter',',','CollectOutput',true);
C_unpacked = C{:};
C_urls = C_unpacked(1:4199, 5);
Here's another one:
import csv, sqlite3, os
from datetime import datetime, timedelta

connection = sqlite3.connect(os.getenv("APPDATA") + "\..\Local\Google\Chrome\User Data\Default\history")
connection.text_factory = str
cur = connection.cursor()

output_file = open('chrome_history.csv', 'wb')
csv_writer = csv.writer(output_file)
headers = ('URL', 'Title', 'Visit Count', 'Date (GMT)')
csv_writer.writerow(headers)

epoch = datetime(1601, 1, 1)
for row in (cur.execute('select url, title, visit_count, last_visit_time from urls')):
    row = list(row)
    url_time = epoch + timedelta(microseconds=row[3])
    row[3] = url_time
    csv_writer.writerow(row)
This isn't exactly what you are looking for, but by using it you can query the history database's tables however you like:
import os
import sqlite3


def Find_path():
    User_profile = os.environ.get("USERPROFILE")
    # Usually this is where the Chrome history file is located; change it if you need to.
    History_path = User_profile + r"\\AppData\Local\Google\Chrome\User Data\Default\History"
    return History_path


def Main():
    data_base = Find_path()
    con = sqlite3.connect(data_base)  # Connect to the database
    c = con.cursor()
    c.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name")  # Change this to your preferred query
    print(c.fetchall())


if __name__ == '__main__':
    Main()
