Parse HTML, write to file - python

I have a question about parsing HTML tags with Python.
My code looks like:
#!/usr/bin/python
# -*- coding: utf-8 -*-
from lxml import html
import requests
import urllib2
import sys
import re
import time
import urllib
import datetime

def get_web():
    try:
        input_sat = open('rtc.xml', 'w')
        godina = datetime.date.today().strftime("%Y")
        print godina
        mjesec = datetime.date.today().strftime("%m")
        print mjesec
        for x in range(32):
            if x < 1:
                x = x + 1
            var = x
            url = 'http://www.rts.rs/page/tv/sr/broadcast/20/RTS+1.html?month={}&year={}&day={}&type=0'.format(mjesec, godina, var)
            page = requests.get(url)
            tree = html.fromstring(page.text)
            a = tree.xpath('//div[@id="center"]/h1/text()')  # date
            b = tree.xpath('//div[@class="ProgramTime"]/text()')  # time
            c = tree.xpath('//div[@class="ProgramName"]/text()')
            e = tree.xpath('//div[@class="ProgramName"]/a[@class="recnik"]/text()')
            for line in zip(a, b, c, e):
                var = line[0]
                print >> input_sat, line+'\n'
    except:
        pass

get_web()
The script works fine and gets the tags from a URL, but how can I write them into a file for processing? When I run my code with the for loop, it doesn't work, and I don't know where the problem is. I rewrote my code, but it won't write what's on the page to a file.

As I understand it, your print usage is incorrect. You have to use the file handle's write() method, and also encode the text to UTF-8:
for line in zip(a, b, c, e):
    var = line[0]
    input_sat.write(line[0].encode('utf-8') + '\n')
It yields:
Programska šema - sreda, 01. jan 2014
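
If you want all four extracted fields on each line rather than just the date, here is a minimal Python 2 sketch of the same loop (assuming the a, b, c, e lists from the question, and that lxml returns unicode strings here):

# Sketch: write every zipped field, tab-separated, letting
# codecs handle the UTF-8 encoding (Python 2 assumed).
import codecs

with codecs.open('rtc.xml', 'w', encoding='utf-8') as input_sat:
    for line in zip(a, b, c, e):
        input_sat.write(u'\t'.join(line) + u'\n')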

Related

Unable to retrieve Chinese texts while scraping

I have created a script that scrapes the website 1688.com. The problem is that the site is in Chinese, so whenever I try to retrieve the text it gives me a bunch of Unicode escapes, and when I export to a CSV file there's nothing in the file.
My code:
# -*- coding: utf-8 -*-
import csv
from urllib import urlopen
from bs4 import BeautifulSoup as BS

csv_content = open('content.csv', 'w+')
writer_content = csv.writer(csv_content)
url = urlopen('https://fuzhuang.1688.com/nvzhuang?spm=a260k.635.1998214976.1.7eqUGT')
html = BS(url, 'lxml')
container = html.find('ul', {'class': 'ch-box fd-clr'})
offers = container.find_all('div', {'class': 'ch-offer-body'})
lst = []
for offer in offers:
    offer_box = offer.find('div', {'component-name': '@alife/ocms-component-1688-pc-ch-offer-pic'})
    images = offer_box.find('img')['src']
    title = offer.find('div', {'class': 'ocms-component-1688-pc-ch-offer-title-0-1-11'}).text
    price = offer.find('div', {'class': 'ocms-component-1688-pc-ch-offer-price-0-1-14'}).text
    lst.append(price)
for item in lst:
    writer_content.writerow([item])
print lst
The output is
[u'\n\n\n\xa5\n109.00\n\n\n\u6210\u4ea4\n329\n\u4ef6\n\n\n', u'\n\n\n\xa5\n56.00\n\n\n\u6210\u4ea4\n195\n\u4ef6\n\n\n', u'\n\n\n\xa5\n83.00\n\n\n\u6210\u4ea4\n109\n\u4ef6\n\n\n', u'\n\n\n\xa5\n69.00\n\n\n\u6210\u4ea4\n208\n\u4ef6\n\n\n', u'\n\n\n\xa5\n46.00\n\n\n\u6210\u4ea4\n204\n\u4ef6\n\n\n', u'\n\n\n\xa5\n45.00\n\n\n\u6210\u4ea4\n54\n\u4ef6\n\n\n', u'\n\n\n\xa5\n82.00\n\n\n\u6210\u4ea4\n38\n\u4ef6\n\n\n', u'\n\n\n\xa5\n48.90\n\n\n\u6210\u4ea4\n318\n\u4ef6\n\n\n']
I have already tried encoding and decoding UTF-8. I would really appreciate it if you could show me how to solve this problem.
This code will save the Chinese characters to a txt file.
For Python 3:
...
(all your code above)

for i in range(len(lst)):
    lst[i] = lst[i].replace('\n', '')  # getting rid of '\n' newlines

Writing to txt:

with open(r'C:\Users\Username\list.txt', 'w', newline='', encoding='utf-8-sig') as f:
    for i in lst:
        f.write(i + '\t')

For Python 2:

import unicodecsv as ucsv

with open(r'C:\Users\Username\list1.txt', 'wb') as f:
    w = ucsv.writer(f, encoding='utf-8-sig')
    for i in lst:
        w.writerow([i + '\t'])
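
If you also want the numeric values rather than the raw strings, a small sketch (assuming each entry keeps the yen/price/成交/count/件 layout shown in the output above) can split them apart:

# Sketch: pull the price and sales count out of one raw entry
# (assumes the '\xa5 <price> \u6210\u4ea4 <count> \u4ef6' layout above).
raw = u'\n\n\n\xa5\n109.00\n\n\n\u6210\u4ea4\n329\n\u4ef6\n\n\n'
parts = [p for p in raw.split(u'\n') if p]  # [u'\xa5', u'109.00', u'成交', u'329', u'件']
price = float(parts[1])  # the number after the yen sign
count = int(parts[3])    # the number after u'成交'
print price, count       # -> 109.0 329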

Import Error: No Module named 'xxxx' Specific

Basically, what I'm trying to do is make a program in Python which takes a URL, copies the source, pulls all comments out, and presents them to the user.
import urllib2
import html2text
import PullsCommentsOut.pullscommentsout

url = raw_input('Please input URL with the text you want to analyze: ')
page = urllib2.urlopen(url)
html_content = page.read().decode('utf8')
rendered_content = html2text.html2text(html_content).encode('ascii', 'ignore')
f = open('file_text.txt', 'wb')
f.write(rendered_content)
f.close()
result = PullsCommentsOut.pullscommentsout(html_content)
print result
And my second file, 'PullsCommentsOut'
import re

def pullscommentsout():
    def comment_remover(text):
        def replacer(match):
            s = match.group(0)
            if s.startswith('/'):
                print s
                return " "  # note: a space and not an empty string
            else:
                return s
        pattern = re.compile(
            r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
            re.DOTALL | re.MULTILINE
        )
        return re.sub(pattern, replacer, text)
    fd = open("test.c", "r")
    buf = fd.read()
    comment_remover(buf)
For the life of me, I can't figure out why Python thinks I'm not importing the proper module. It doesn't make sense.
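One thing worth checking, and this is only a guess from the code shown: import PullsCommentsOut.pullscommentsout treats PullsCommentsOut as a package, which only works if it is a directory containing an __init__.py. If PullsCommentsOut.py is just a single file next to the main script, a plain module import is what Python expects; a sketch:

# Sketch, assuming PullsCommentsOut.py is a plain module sitting in
# the same directory as the main script (not a package directory):
import PullsCommentsOut

result = PullsCommentsOut.pullscommentsout()  # note: as written, the function takes no arguments
print result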

Saving to a file issue

I am trying to save the generated list to a file. I see the printout of the list fine, but it will not write to the Compoundlist.csv file. I am not sure what I am doing wrong; I have tried writing after the list is generated and also during the loop, and I have gotten the same result.
import urllib
import urllib.request
from bs4 import BeautifulSoup
import os
import csv

def make_soup(url):
    thepage = urllib.request.urlopen(url)
    soupdata = BeautifulSoup(thepage, "html.parser")
    return soupdata

compoundlist = []
soup = make_soup("http://www.genome.jp/dbget-bin/www_bget?ko00020")
i = 1
file = open("Compoundlist.csv", "wb")
for record in soup.findAll("nobr"):
    compound = ''
    if (record.text[0] == "C" and record.text[1] == '0') or (record.text[0] == "C" and record.text[1] == '1'):
        compoundlist = "http://www.genome.jp/dbget-bin/www_bget?cpd:" + record.text
        file.write(compoundlist)
        print(compoundlist)
Try adding the following to the end of your code, to flush the open file buffer into the file:

file.close()
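
Alternatively, a with block closes and flushes the file for you. A sketch of the same loop (assuming Python 3 here, where a file opened in "wb" mode would need bytes, so text mode is used instead):

# Sketch: the context manager closes/flushes automatically; text mode
# accepts plain str in Python 3 (the question's "wb" mode would not).
with open("Compoundlist.csv", "w") as f:
    for record in soup.findAll("nobr"):
        if record.text[:2] in ("C0", "C1"):
            f.write("http://www.genome.jp/dbget-bin/www_bget?cpd:" + record.text + "\n")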

Beautifulsoup Parser Truncating larger json

I'm trying to scrape some JSON data from a website. I'm using BeautifulSoup (bs4), as shown in the code below:
import re
import csv
import json
import urllib2
from bs4 import BeautifulSoup as BS

city = 'Helsinki'
csvFile = csv.writer(open(city + ".csv", "wb+"))
csvFile.writerow(["tempid", "latitude", "longitude"])
pageID = 0
locPage = urllib2.urlopen("http://runkeeper.com/user/maxspowers79/route/2481336")
soup = BS(locPage, "lxml").findAll('script', {"src": False})
print soup
pageID += 1
print pageID
for s in soup:
    if 'routePoints' in s.string:
        value = "[{" + s.string.split("}];")[0].split("[{")[1] + "}]"
        # print value
        jsonObj = json.loads(value)
        for x in jsonObj:
            csvFile.writerow([pageID, x["latitude"], x["longitude"]])
As an example, this is the Runkeeper website with a random city and a random route I've tested on. The code works fine for other, similar pages, but for longer routes like this one (the GPS JSON is larger, as you can see if you view the page source in the browser), the soup variable is truncated, as the print command shows. Hence the JSON is invalid and I cannot parse it.
I tried using a different parser (html5lib) as well, but that was worse. Is there a limit to how big a string the soup variable can hold? Otherwise, why would it truncate?
How do I handle this?
I tested your code and it seems that, yes, BeautifulSoup has some limitations on tag content.
Consider using dumb and straightforward string manipulation instead:
import re
import csv
import json
import urllib2

city = 'Helsinki'
csvFile = csv.writer(open(city + ".csv", "wb+"))
csvFile.writerow(["tempid", "latitude", "longitude"])
pageID = 0
locPage = urllib2.urlopen("http://runkeeper.com/user/maxspowers79/route/2481336")
content = locPage.read()

start_at_s, end_at_s = 'var routePoints = ', 'mapController.initialize'
start_at_p = content.index(start_at_s) + len(start_at_s)
end_at_p = content.index(end_at_s)
raw_json = content[start_at_p:end_at_p].strip().strip(';')

jsonObj = json.loads(raw_json)
pageID += 1
print pageID
for x in jsonObj:
    print x
    csvFile.writerow([pageID, x["latitude"], x["longitude"]])
Try rewriting your code with lxml. It should be more robust than BeautifulSoup.
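A sketch of the same extraction with lxml (assuming the page structure from the question; the routePoints string splitting is unchanged, and this is untested against the live page):

# Sketch: the same script-tag scan with lxml instead of BeautifulSoup.
import json
import urllib2
from lxml import html

locPage = urllib2.urlopen("http://runkeeper.com/user/maxspowers79/route/2481336")
tree = html.fromstring(locPage.read())
for script in tree.xpath('//script[not(@src)]/text()'):
    if 'routePoints' in script:
        value = "[{" + script.split("}];")[0].split("[{")[1] + "}]"
        jsonObj = json.loads(value)
        print len(jsonObj), "route points parsed"

lxml hands back the full text of each script tag, so the large JSON should come through untruncated.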

Python's .strip() doesn't save outside of if statement?

I have Scrapy pulling data from a web page. An issue I've run across is that it pulls a lot of whitespace, so I've elected to use .strip(), as suggested by others. I've run into an issue, though:
if a.strip():
    print a
if b.strip():
    print b
Returns:
a1
b1
.
.
.
But this:
if a.strip():
aList.append(a)
if b.strip():
bList.append(b)
print aList, bList
Returns this:
a1
b1
I'm trying to simulate the whitespace that I remove with .strip() here, but you get the point. For whatever reason, it adds the whitespace to the list even though I told it not to. I can even print the list inside the if statement and it shows correctly, but when I print outside the if statements, it doesn't work as I intended.
Here is my entire code:
# coding: utf-8
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.exporter import CsvItemExporter
import re
import csv
import urlparse
from stockscrape.items import EPSItem
from itertools import izip

class epsScrape(BaseSpider):
    name = "eps"
    allowed_domains = ["investors.com"]
    ifile = open('test.txt', "r")
    reader = csv.reader(ifile)
    start_urls = []
    for row in ifile:
        url = row.replace("\n", "")
        if url == "symbol":
            continue
        else:
            start_urls.append("http://research.investors.com/quotes/nyse-" + url + ".htm")
    ifile.close()

    def parse(self, response):
        f = open("eps.txt", "a+")
        sel = HtmlXPathSelector(response)
        sites = sel.select("//div")
        # items = []
        for site in sites:
            symbolList = []
            epsList = []
            item = EPSItem()
            item['symbol'] = site.select("h2/span[contains(@id, 'qteSymb')]/text()").extract()
            item['eps'] = site.select("table/tbody/tr/td[contains(@class, 'rating')]/span/text()").extract()
            strSymb = str(item['symbol'])
            newSymb = strSymb.replace("[]", "").replace("[u'", "").replace("']", "")
            strEps = str(item['eps'])
            newEps = strEps.replace("[]", "").replace(" ", "").replace("[u'\\r\\n", "").replace("']", "")
            if newSymb.strip():
                symbolList.append(newSymb)
                # print symbolList
            if newEps.strip():
                epsList.append(newEps)
                # print epsList
            print symbolList, epsList
            for symb, eps in izip(symbolList, epsList):
                f.write("%s\t%s\n" % (symb, eps))
        f.close()
strip does not modify the string in-place. It returns a new string with the whitespace stripped.
>>> a = ' foo '
>>> b = a.strip()
>>> a
' foo '
>>> b
'foo'
I figured out what was causing the confusion. It's the location at which I declared the variables/lists: I was declaring them inside the for loop, so every iteration rewrote them, and an empty list or variable evaluates as false in my if statement.
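
In other words, moving the list declarations above the loop keeps appended items across iterations. A minimal sketch of the fix (extract_symbol and extract_eps are hypothetical stand-ins for the XPath-and-replace chains in parse() above):

# Sketch: declare the lists once, before the loop, so appended
# values survive every iteration; the helpers are hypothetical.
symbolList = []
epsList = []
for site in sites:
    newSymb = extract_symbol(site)  # stands in for the select/replace chain
    newEps = extract_eps(site)      # stands in for the select/replace chain
    if newSymb.strip():
        symbolList.append(newSymb)
    if newEps.strip():
        epsList.append(newEps)
print symbolList, epsList  # now holds every stripped value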
