I am using scrapy to scrap blogs and then store the data in mongodb. At first i got the InvalidDocument Exception. So obvious to me is that the data is not in the right encoding. So before persisting the object, in my MongoPipeline i check if the document is in 'utf-8 strict', and only then i try to persist the object to mongodb. BUT Still i get InvalidDocument Exceptions, now that is annoying.
This is my code my MongoPipeline Object that persists objects to mongodb
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
import pymongo
import sys, traceback
from scrapy.exceptions import DropItem
from crawler.items import BlogItem, CommentItem
class MongoPipeline(object):
collection_name = 'master'
def __init__(self, mongo_uri, mongo_db):
self.mongo_uri = mongo_uri
self.mongo_db = mongo_db
#classmethod
def from_crawler(cls, crawler):
return cls(
mongo_uri=crawler.settings.get('MONGO_URI'),
mongo_db=crawler.settings.get('MONGO_DATABASE', 'posts')
)
def open_spider(self, spider):
self.client = pymongo.MongoClient(self.mongo_uri)
self.db = self.client[self.mongo_db]
def close_spider(self, spider):
self.client.close()
def process_item(self, item, spider):
if type(item) is BlogItem:
try:
if 'url' in item:
item['url'] = item['url'].encode('utf-8', 'strict')
if 'domain' in item:
item['domain'] = item['domain'].encode('utf-8', 'strict')
if 'title' in item:
item['title'] = item['title'].encode('utf-8', 'strict')
if 'date' in item:
item['date'] = item['date'].encode('utf-8', 'strict')
if 'content' in item:
item['content'] = item['content'].encode('utf-8', 'strict')
if 'author' in item:
item['author'] = item['author'].encode('utf-8', 'strict')
except: # catch *all* exceptions
e = sys.exc_info()[0]
spider.logger.critical("ERROR ENCODING %s", e)
traceback.print_exc(file=sys.stdout)
raise DropItem("Error encoding BLOG %s" % item['url'])
if 'comments' in item:
comments = item['comments']
item['comments'] = []
try:
for comment in comments:
if 'date' in comment:
comment['date'] = comment['date'].encode('utf-8', 'strict')
if 'author' in comment:
comment['author'] = comment['author'].encode('utf-8', 'strict')
if 'content' in comment:
comment['content'] = comment['content'].encode('utf-8', 'strict')
item['comments'].append(comment)
except: # catch *all* exceptions
e = sys.exc_info()[0]
spider.logger.critical("ERROR ENCODING COMMENT %s", e)
traceback.print_exc(file=sys.stdout)
self.db[self.collection_name].insert(dict(item))
return item
And still i get the following exception:
au coeur de l\u2019explosion de la bulle Internet n\u2019est probablement pas \xe9tranger au succ\xe8s qui a suivi. Mais franchement, c\u2019est un peu court comme argument !Ce que je sais dire, compte tenu de ce qui pr\xe9c\xe8de, c\u2019est quelles sont les conditions pour r\xe9ussir si l\u2019on est vraiment contraint de rester en France. Ce sont des sujets que je d\xe9velopperai dans un autre article.',
'date': u'2012-06-27T23:21:25+00:00',
'domain': 'reussir-sa-boite.fr',
'title': u'Peut-on encore entreprendre en France ?\t\t\t ',
'url': 'http://www.reussir-sa-boite.fr/peut-on-encore-entreprendre-en-france/'}
Traceback (most recent call last):
File "h:\program files\anaconda\lib\site-packages\twisted\internet\defer.py", line 588, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "H:\PDS\BNP\crawler\crawler\pipelines.py", line 76, in process_item
self.db[self.collection_name].insert(dict(item))
File "h:\program files\anaconda\lib\site-packages\pymongo\collection.py", line 409, in insert
gen(), check_keys, self.uuid_subtype, client)
InvalidDocument: Cannot encode object: {'author': 'Arnaud Lemasson',
'content': 'Tellement vrai\xe2\x80\xa6 Il faut vraiment \xc3\xaatre motiv\xc3\xa9 aujourd\xe2\x80\x99hui pour monter sa bo\xc3\xaete. On est pr\xc3\xa9lev\xc3\xa9 de partout, je ne pense m\xc3\xaame pas \xc3\xa0 embaucher, cela me co\xc3\xbbterait bien trop cher. Bref, 100% d\xe2\x80\x99accord avec vous. Le probl\xc3\xa8me, je ne vois pas comment cela pourrait changer avec le gouvernement actuel\xe2\x80\xa6 A moins que si, j\xe2\x80\x99ai pu lire il me semble qu\xe2\x80\x99ils avaient en t\xc3\xaate de r\xc3\xa9duire l\xe2\x80\x99IS pour les petites entreprises et de l\xe2\x80\x99augmenter pour les grandes\xe2\x80\xa6 A voir',
'date': '2012-06-27T23:21:25+00:00'}
2015-11-04 15:29:15 [scrapy] INFO: Closing spider (finished)
2015-11-04 15:29:15 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 259,
'downloader/request_count': 1,
'downloader/request_method_count/GET': 1,
'downloader/response_bytes': 252396,
'downloader/response_count': 1,
'downloader/response_status_count/200': 1,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2015, 11, 4, 14, 29, 15, 701000),
'log_count/DEBUG': 2,
'log_count/ERROR': 1,
'log_count/INFO': 7,
'response_received_count': 1,
'scheduler/dequeued': 1,
'scheduler/dequeued/memory': 1,
'scheduler/enqueued': 1,
'scheduler/enqueued/memory': 1,
'start)
time': datetime.datetime(2015, 11, 4, 14, 29, 13, 191000)}
Another funny thing from the comment of #eLRuLL i did the following:
>>> s = "Tellement vrai\xe2\x80\xa6 Il faut vraiment \xc3\xaatre motiv\xc3\xa9 aujourd\xe2\x80\x99hui pour monter sa bo\xc3\xaete. On est pr\xc3\xa9lev\xc3\xa9 de partout, je ne pense m\xc3\xaame pas \xc3\xa0 embaucher, cela me"
>>> s
'Tellement vrai\xe2\x80\xa6 Il faut vraiment \xc3\xaatre motiv\xc3\xa9 aujourd\xe2\x80\x99hui pour monter sa bo\xc3\xaete. On est pr\xc3\xa9lev\xc3\xa9 de partout, je ne pense m\xc3\xaame pas \xc3\xa0 embaucher, cela me'
>>> se = s.encode("utf8", "strict")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 14: ordinal not in range(128)
>>> se = s.encode("utf-8", "strict")
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 14: ordinal not in range(128)
>>> s.decode()
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 14: ordinal not in range(128)
Then my question is. If this text cannot be encoded. Then why, is my MongoPipeline try catch not catching this EXCEPTION? Because only objects that don't raise any exception should be appended to item['comments'] ?
Finally I figured it out. The problem was not with encoding. It was with the structure of the documents.
Because i went off on the standard MongoPipeline example which does not deal with nested scrapy items.
What i am doing is:
BlogItem:
"url"
...
comments = [CommentItem]
So my BlogItem has a list of CommentItems. Now the problem came here, for persisting the object in the database i do:
self.db[self.collection_name].insert(dict(item))
So here i am parsing the BlogItem to a dict. But i am not parsing the list of CommentItems. And because the traceback displays the CommentItem kind of like a dict, It did not occur to me that the problematic object is not a dict!
So finally the the way to fix this problem is to change the line when appending the comment to the comment list as such:
item['comments'].append(dict(comment))
Now MongoDB considers it as a valid document.
Lastly, for the last part where i ask why i am getting a exception on the python console and not in the script.
The reason is because i was working on the python console, which only supports ascii. And thus the error.
I got this error when running a query
db.collection.find({'attr': {'$gte': 20}})
and some records in collection had a non-numeric value for attr.
First, when you do "somestring".encode(...), isn't changing "somestring", but it returns a new encoded string, so you should use something like:
item['author'] = item['author'].encode('utf-8', 'strict')
and the same for the other fields.
I ran into the same error using a numpy array in a Mongo query :
'myField' : { '$in': myList },
The fix was simply to convert the nd.array() into a list :
'myField' : { '$in': list(myList) },
in my case it was super stupid yet not easy to notice:
I accidentally wrote
f"indexes_access.{jsonData['index']}: {jsonData['newState']}"
instead of
{f"indexes_access.{jsonData['index']}": f"{jsonData['newState']}"}
(one long string parsed with f strings instead of key and value parsed separately)
Related
I am iterating over an XML file to insert some of its attributes inside a JSON to develop a Corpus. For some reason, when inserting the date and the body of the XML it always inserts the same text inside the JSON line
The XML file format (I'm trying to get the title, timestamp and text from all the pages):
<page>
<title>MediaWiki:Monobook.js</title>
<ns>8</ns>
<id>4</id>
<revision>
<id>45582</id>
<parentid>45581</parentid>
<timestamp>2007-09-20T03:20:46Z</timestamp>
<contributor>
<username>Superzerocool</username>
<id>404</id>
</contributor>
<comment>Limpiando</comment>
<model>javascript</model>
<format>text/javascript</format>
<text bytes="34" xml:space="preserve">/* Utilizar MediaWiki:Common.js */</text>
<sha1>5dy7xlkatqg5epjzrq48br6yeh5uu34</sha1>
</revision>
</page>
My code:
if __name__ == "__main__":
path = 'local path'
tree = ET.parse(path)
root = tree.getroot()
with open("C:/Users/User/Documents/Github/News-Corpus/corpus.json", "w") as f:
for page in tqdm(root.findall('page')):
title = page.find('title').text
dictionary = {}
dictionary["title"] = title
for revision in root.iter('revision'):
timestamp = revision.find('timestamp').text
dictionary["timestamp"] = timestamp
body = revision.find('text').text
dictionary["body"] = body
f.write(json.dumps(dictionary))
f.write("\n")
The output I get:
{"title": "MediaWiki:Monobook.js", "timestamp": "2022-09-16T13:07:15Z", "body": "Los pasajeros les fue devuelta el importe de su billete de vuelo perdida?"}
{"title": "MediaWiki:Administrators", "timestamp": "2022-09-16T13:07:15Z", "body": "Los pasajeros les fue devuelta el importe de su billete de vuelo perdida?"}
{"title": "MediaWiki:Allmessages", "timestamp": "2022-09-16T13:07:15Z", "body": "Los pasajeros les fue devuelta el importe de su billete de vuelo perdida?"}
{"title": "MediaWiki:Allmessagestext", "timestamp": "2022-09-16T13:07:15Z", "body": "Los pasajeros les fue devuelta el importe de su billete de vuelo perdida?"}
{"title": "MediaWiki:Allpagessubmit", "timestamp": "2022-09-16T13:07:15Z", "body": "Los pasajeros les fue devuelta el importe de su billete de vuelo perdida?"}
As you can see, I always get the same timestamp and the same body, does anyone know why this happens? Help is much appreciated.
I tried with \\n, but it didn't worked, so I think with another reserved word could work, but I can not find the good one.
table_config = [
{
'dbName': f'gomez_datalake_{env}_{team}_{dataset}_db',
'table': 'ConFac',
'partitionKey': 'DL_PERIODO',
'schema': [
['TIPO_DE_VALOR', 'STRING', 2, None,
"CÓDIGO DEL PARÁMETRO DE SISTEMA."
"EJEMPLOS:"
"UF: VALOR DE LA UF"
"IP: VALOR DEL IPC"
"MO: MONEDA"
"IV: VALOR DEL VA"
"UT: VALOR DEL UTM"],
['ORIGEN', 'STRING', 4, None, "IDENTIFICADOR DE USUARIO"]
]
With
stream = open(afile, 'r')
self.meta = yaml.load(stream)
you can easyly read YAML file in python, but it has not got --- at the end I reach error (same with ...):
yaml.composer.ComposerError: expected a single document in the stream
in "El-punt-de-llibre.md", line 2, column 1
but found another document
in "El-punt-de-llibre.md", line 6, column 1
But YAML specs allow that:
YAML uses three dashes (“---”) to separate directives from document content. This also serves to signal the start of a document if no directives are present. Three dots ( “...”) indicate the end of a document without starting a new one, for use in communication channels.
So, how do you read this
---
title: "El punt de llibre"
abstract: "Estimar a quina pàgina està el punt de llibre"
keywords: ["when", "activitat", "3/3", "grup", "estimació", "aproximació", "funció lineal - proporcionalitat", "ca"]
comments: true
...
in python?
Your YAML stream/file appears to have more than document in it, for example trying to parse this would give the same error message:
---
title: "El punt de llibre"
abstract: "Estimar a quina pàgina està el punt de llibre"
keywords: ["when", "activitat", "3/3", "grup", "estimació", "aproximació", "funció lineal - proporcionalitat", "ca"]
comments: true
...
---
title: "El punt de llibre"
abstract: "Estimar a quina pàgina està el punt de llibre"
keywords: ["when", "activitat", "3/3", "grup", "estimació", "aproximació", "funció lineal - proporcionalitat", "ca"]
comments: true
...
---
title: "El punt de llibre"
abstract: "Estimar a quina pàgina està el punt de llibre"
keywords: ["when", "activitat", "3/3", "grup", "estimació", "aproximació", "funció lineal - proporcionalitat", "ca"]
comments: true
...
To process such a stream you could use the following approach:
import yaml
with open('test.yaml') as f_yaml:
for doc in yaml.safe_load_all(f_yaml):
print doc
Which would show you the following:
{'keywords': ['when', 'activitat', '3/3', 'grup', u'estimaci\xf3', u'aproximaci\xf3', u'funci\xf3 lineal - proporcionalitat', 'ca'], 'abstract': u'Estimar a quina p\xe0gina est\xe0 el punt de llibre', 'comments': True, 'title': 'El punt de llibre'}
{'keywords': ['when', 'activitat', '3/3', 'grup', u'estimaci\xf3', u'aproximaci\xf3', u'funci\xf3 lineal - proporcionalitat', 'ca'], 'abstract': u'Estimar a quina p\xe0gina est\xe0 el punt de llibre', 'comments': True, 'title': 'El punt de llibre'}
{'keywords': ['when', 'activitat', '3/3', 'grup', u'estimaci\xf3', u'aproximaci\xf3', u'funci\xf3 lineal - proporcionalitat', 'ca'], 'abstract': u'Estimar a quina p\xe0gina est\xe0 el punt de llibre', 'comments': True, 'title': 'El punt de llibre'}
If your YAML source contains more than one document, you can get the first document with
list(yaml.safe_load_all(stream))[0]
However, it seems strange that a ... causes PyYaml to break and you may want to report that as bug.
Use ruamel.yaml file to handle a YAML file with comments and spaces
import ruamel.yaml
yaml = ruamel.yaml.YAML()
with open(yaml_file) as f:
for doc in yaml.load_all(f):
print(doc)
I am trying to populate a postgresql database with initial values using fixtures in django. I keep getting these weird Could not load publication.Article(pk=None): value too long for type character varying(100)
errors even though my model looks like this:
class Article(models.Model):
_id = models.CharField(max_length=1000)
author_name = models.CharField(max_length=1000)
caption = models.CharField(max_length=1000)
isGraphic = models.BooleanField(max_length=1000, default=True)
pictures = models.URLField(max_length=1000)
text = models.CharField(max_length=10000)
title = models.CharField(max_length=1000)
user_img = models.URLField(max_length=1000)
videoname = models.CharField(max_length=1000)
vimeo_id = models.IntegerField(max_length=1000)
Traceback (most recent call last):
File "manage.py", line 10, in <module>
execute_from_command_line(sys.argv)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/core/management/__init__.py", line 385, in execute_from_command_line
utility.execute()
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/core/management/__init__.py", line 377, in execute
self.fetch_command(subcommand).run_from_argv(self.argv)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/core/management/base.py", line 288, in run_from_argv
self.execute(*args, **options.__dict__)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/core/management/base.py", line 338, in execute
output = self.handle(*args, **options)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/core/management/commands/loaddata.py", line 61, in handle
self.loaddata(fixture_labels)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/core/management/commands/loaddata.py", line 91, in loaddata
self.load_label(fixture_label)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/core/management/commands/loaddata.py", line 148, in load_label
obj.save(using=self.using)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/core/serializers/base.py", line 173, in save
models.Model.save_base(self.object, using=using, raw=True)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/db/models/base.py", line 617, in save_base
updated = self._save_table(raw, cls, force_insert, force_update, using, update_fields)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/db/models/base.py", line 698, in _save_table
result = self._do_insert(cls._base_manager, using, fields, update_pk, raw)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/db/models/base.py", line 731, in _do_insert
using=using, raw=raw)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/db/models/manager.py", line 92, in manager_method
return getattr(self.get_queryset(), name)(*args, **kwargs)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/db/models/query.py", line 921, in _insert
return query.get_compiler(using=using).execute_sql(return_id)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/db/models/sql/compiler.py", line 920, in execute_sql
cursor.execute(sql, params)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/db/backends/utils.py", line 81, in execute
return super(CursorDebugWrapper, self).execute(sql, params)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/db/backends/utils.py", line 65, in execute
return self.cursor.execute(sql, params)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/db/utils.py", line 94, in __exit__
six.reraise(dj_exc_type, dj_exc_value, traceback)
File "/Users/sam.royston/PycharmProjects/sahelien_d/lib/python2.7/site-packages/django/db/backends/utils.py", line 65, in execute
return self.cursor.execute(sql, params)
django.db.utils.DataError: Problem installing fixture '/Users/sam.royston/PycharmProjects/sahelien_d/sahelien_django/fixtures/test.json' : Could not load publication.Article(pk=None): value too long for type character varying(100)
why am I getting this error?
test.json:
[
{ "model" : "publication.Article" , "fields":
{
"_id" : "5306dfa9ed2379f03a000001" ,
"author_name" : "Sahélien Tombouctou",
"caption" : "Les n’ont fait aucune victime, ni de dégâts matériels",
"isGraphic" : false,
"pictures" : [],
"text" : "La ville de Tombouctou a reçu des tirs d'obus dans la nuit de dimanche. \n<br>\n<br>\nLes deux premiers obus sont tombés dans la localité de Kabara, à 10km de la cité des 333 saints. Le troisième obus est tombé sur la route de Goundam.\n<br>\n<br>\nLes tirs n’ont fait aucune victime, ni de dégâts matériels. Selon le lieutenant-colonel Seydou Koné, en poste à Tombouctou, l'armée malienne est mobilisée pour déterminer l'origine de cette attaque.",
"title" : "Tombouctou attaquée à la roquette",
"videoname" : "okok.mp4",
"vimeo_id" : "87246621"
}
}
]
Your json fixture is missing a primary key. Django automatically adds a primary key to your models; called id. As this key is required, you should provide it in fixtures.
The fixture you have posted is does not have this key, you should add it:
[
{ "model" : "publication.Article" , "fields":
{
"id": "1",
"_id" : "5306dfa9ed2379f03a000001" ,
"author_name" : "Sahélien Tombouctou",
"caption" : "Les n’ont fait aucune victime, ni de dégâts matériels",
"isGraphic" : false,
"pictures" : [],
"text" : "La ville de Tombouctou a reçu des tirs d'obus dans la nuit de dimanche. \n<br>\n<br>\nLes deux premiers obus sont tombés dans la localité de Kabara, à 10km de la cité des 333 saints. Le troisième obus est tombé sur la route de Goundam.\n<br>\n<br>\nLes tirs n’ont fait aucune victime, ni de dégâts matériels. Selon le lieutenant-colonel Seydou Koné, en poste à Tombouctou, l'armée malienne est mobilisée pour déterminer l'origine de cette attaque.",
"title" : "Tombouctou attaquée à la roquette",
"videoname" : "okok.mp4",
"vimeo_id" : "87246621"
}
}
]
You are missing key fields that are required in your model from your fixture. You need to add user_img and pictures cannot be empty.
The fixture needs to pass all the validation rules of your model; and since all fields are required as per your model, they all need to be available in the fixture.
In addition, you have a max_length argument for integer, boolean and url fields which are not applicable.
I have two questions:
1) What I have done wrong in the script below? The result in not encoded propertly and all non standard characters are stored incorrectly. When I print out data list it gives me a proper list of unicode types:
[u'Est-ce que tu peux traduire \xc3\xa7a pour moi? \n \n \n Can you translate this for me?'], [u'Chicago est tr\xc3\xa8s diff\xc3\xa9rente de Boston. \n \n \n Chicago is very different from Boston.'],
After that I strip all extra spaces and next lines and result in file is like this (looks same when print and save to file):
Est-ce que tu peux traduire ça pour moi?;Can you translate this for me?
Chicago est très différente de Boston.;Chicago is very different from Boston.
2) What other than Python scripting langage would you recommend?
import requests
import unicodecsv, os
from bs4 import BeautifulSoup
import re
import html5lib
countries = ["fr"] #,"id","bn","my","chin","de","es","fr","hi","ja","ko","pt","ru","th","vi","zh"]
for country in countries:
f = open("phrase_" + country + ".txt","w")
w = unicodecsv.writer(f, encoding='utf-8')
toi = 1
print country
while toi<2:
url = "http://www.englishspeak.com/"+ country +"/english-phrases.cfm?newCategoryShowed=" + str(toi) + "&sortBy=28"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html5lib')
soup.unicode
[s.extract() for s in soup('script')]
[s.extract() for s in soup('style')]
[s.extract() for s in soup('head')]
[s.extract() for s in soup("table" , { "height" : "102" })]
[s.extract() for s in soup("td", { "class" : "copyLarge"})]
[s.extract() for s in soup("td", { "width" : "21%"})]
[s.extract() for s in soup("td", { "colspan" : "3"})]
[s.extract() for s in soup("td", { "width" : "25%"})]
[s.extract() for s in soup("td", { "class" : "blacktext"})]
[s.extract() for s in soup("div", { "align" : "center"})]
data = []
rows = soup.find_all('tr', {"class": re.compile("Data.")})
for row in rows:
cols = row.find_all('td')
cols = [ele.text.strip() for ele in cols]
data.append([ele for ele in cols if ele])
wordsList = []
for index, item in enumerate(data):
str_tmp = "".join(data[index]).encode('utf-8')
str_tmp = re.sub(r' +\n\s+', ';', str_tmp)
str_tmp = re.sub(r' +', ' ', str_tmp)
wordsList.append(str_tmp.decode('utf-8'))
print str_tmp
w.writerow(wordsList)
toi += 1
You should use r.text not r.content because content are the bytes and text is the decoded text:
soup = BeautifulSoup(r.text, 'html5lib')
You can just write utf-8 encoded to file:
with open("out.txt","w") as f:
for d in data:
d = " ".join(d).encode("utf-8")
d = re.sub(r'\n\s+', ';', d)
d = re.sub(r' +', ' ', d)
f.write(d)
Output:
Fais attention en conduisant. ;Be careful driving.Fais attention. ;Be careful.Est-ce que tu peux traduire ça pour moi? ;Can you translate this for me?Chicago est très différente de Boston. ;Chicago is very different from Boston.Ne t'inquiète pas. ;Don't worry.Tout le monde le sais. ;Everyone knows it.Tout est prêt. ;Everything is ready.Excellent. ;Excellent.De temps en temps. ;From time to time.Bonne idée. ;Good idea.Il l'aime beaucoup. ;He likes it very much.A l'aide! ;Help!Il arrive bientôt. ;He's coming soon.Il a raison. ;He's right.Il est très ennuyeux. ;He's very annoying.Il est très célèbre. ;He's very famous.Comment ça va? ;How are you?Comment va le travail? ;How's work going?Dépêche-toi! ;Hurry!J'ai déjà mangé. ;I ate already.Je ne vous entends pas. ;I can't hear you.Je ne sais pas m'en servir. ;I don't know how to use it.Je ne l'aime pas. ;I don't like him.Je ne l'aime pas. ;I don't like it.Je ne parle pas très bien. ;I don't speak very well.Je ne comprends pas. ;I don't understand.Je n'en veux pas. ;I don't want it.Je ne veux pas ça. ;I don't want that.Je ne veux pas te déranger. ;I don't want to bother you.Je me sens bien. ;I feel good.Je sors du travail à six heures. ;I get off of work at 6.J'ai mal à la tête. ;I have a headache.J'espère que votre femme et vous ferez un bon voyage. ;I hope you and your wife have a nice trip.Je sais. ;I know.Je l'aime. ;I like her.J'ai perdu ma montre. ;I lost my watch.Je t'aime. ;I love you.J'ai besoin de changer de vêtements. ;I need to change clothes.J'ai besoin d'aller chez moi. ;I need to go home.Je veux seulement un en-cas. ;I only want a snack.Je pense que c'est bon. ;I think it tastes good.Je pense que c'est très bon. ;I think it's very good.Je pensais que les vêtements étaient plus chers. ;I thought the clothes were cheaper.J'allais quitter le restaurant quand mes amis sont arrivés. ;I was about to leave the restaurant when my friends arrived.Je voudrais faire une promenade. ;I'd like to go for a walk.Si vous avez besoin de mon aide, faites-le-moi savoir s'il vous plaît. ;If you need my help, please let me know.Je t'appellerai vendredi. ;I'll call you when I leave.Je reviendrai plus tard. ;I'll come back later.Je paierai. ;I'll pay.Je vais le prendre. ;I'll take it.Je t'emmenerai à l'arrêt de bus. ;I'll take you to the bus stop.Je suis un Américain. ;I'm an American.Je nettoie ma chambre. ;I'm cleaning my room.J'ai froid. ;I'm cold.Je viens te chercher. ;I'm coming to pick you up.Je vais partir. ;I'm going to leave.Je vais bien, et toi? ;I'm good, and you?Je suis content. ;I'm happy.J'ai faim. ;I'm hungry.Je suis marié. ;I'm married.Je ne suis pas occupé. ;I'm not busy.Je ne suis pas marié. ;I'm not married.Je ne suis pas encore prêt. ;I'm not ready yet.Je ne suis pas sûr. ;I'm not sure.Je suis désolé, nous sommes complets. ;I'm sorry, we're sold out.J'ai soif. ;I'm thirsty.Je suis très occupé. Je n'ai pas le temps maintenant. ;I'm very busy. I don't have time now.Est-ce que Monsieur Smith est un Américain? ;Is Mr. Smith an American?Est-ce que ça suffit? ;Is that enough?C'est plus long que deux kilomètres. ;It's longer than 2 miles.Je suis ici depuis deux jours. ;I've been here for two days.J'ai entendu dire que le Texas était beau comme endroit. ;I've heard Texas is a beautiful place.Je n'ai jamais vu ça avant. ;I've never seen that before.Juste un peu. ;Just a little.Juste un moment. ;Just a moment.Laisse-moi vérifier. ;Let me check.laisse-moi y réfléchir. ;Let me think about it.Allons voir. ;Let's go have a look.Pratiquons l'anglais. ;Let's practice English.Pourrais-je parler à madame Smith s'il vous plaît? ;May I speak to Mrs. Smith please?Plus que ça. ;More than that.Peu importe. ;Never mind.La prochaine fois. ;Next time.Non, merci. ;No, thank you.Non. ;No.N'importe quoi. ;Nonsense.Pas récemment. ;Not recently.Pas encore. ;Not yet.Rien d'autre. ;Nothing else.Bien sûr. ;Of course.D'accord. ;Okay.S'il vous plaît remplissez ce formulaire. ;Please fill out this form.S'il vous plaît emmenez-moi à cette adresse. ;Please take me to this address.S'il te plaît écris-le. ;Please write it down.Vraiment? ;Really?Juste ici. ;Right here.Juste là. ;Right there.A bientôt. ;See you later.A demain. ;See you tomorrow.A ce soir. ;See you tonight.Elle est jolie. ;She's pretty.Désolé de vous déranger. ;Sorry to bother you.Arrête! ;Stop!Tente ta chance. ;Take a chance.Réglez ça dehors. ;Take it outside.Dis-moi. ;Tell me.Merci Mademoiselle. ;Thank you miss.Merci Monsieur. ;Thank you sir.Merci beaucoup. ;Thank you very much.Merci. ;Thank you.Merci pour tout. ;Thanks for everything.Merci pour ton aide. ;Thanks for your help.Ça a l'air super. ;That looks great.Ça sent mauvais. ;That smells bad.C'est pas mal. ;That's alright.Ça suffit. ;That's enough.C'est bon. ;That's fine.C'est tout. ;That's it.Ce n'est pas juste. ;That's not fair.Ce n'est pas vrai. ;That's not right.C'est vrai. ;That's right.C'est dommage. ;That's too bad.C'est trop. ;That's too many.C'est trop. ;That's too much.Le livre est sous la table. ;The book is under the table.Ils vont revenir tout de suite. ;They'll be right back.Ce sont les mêmes. ;They're the same.Ils sont très occupés. ;They're very busy.Ça ne marche pas. ;This doesn't work.C'est très difficile. ;This is very difficult.C'est très important. ;This is very important.Essaie-le/la. ;Try it.Très bien, merci. ;Very good, thanks.Nous l'aimons beaucoup. ;We like it very much.Voudriez-vous prendre un message s'il vous plaît? ;Would you take a message please?Oui, vraiment. ;Yes, really.Vos affaires sont toutes là. ;Your things are all here.Tu es belle. ;You're beautiful.Tu es très sympa. ;You're very nice.Tu es très intelligent. ;You're very smart.
Also you don't actually use the data in your list comps so they seem a little pointless: