1
0
forked from ScoDoc/ScoDoc

replaced SuppressAccent

This commit is contained in:
Emmanuel Viennet 2021-07-12 11:54:04 +02:00
parent 2b95f6e737
commit 99279a96bd
7 changed files with 41 additions and 246 deletions

View File

@ -1,207 +0,0 @@
# -*- mode: python -*-
# -*- coding: utf-8 -*-
"""Suppression des accents d'une chaine
Source: http://wikipython.flibuste.net/moin.py/JouerAvecUnicode#head-1213938516c633958921591439c33d202244e2f4
"""
import six
# Translation table for unicode.translate()/str.translate(): maps a Unicode
# code point (int) to its replacement string. Filled once, at import time,
# by _fill_reptable() below.
_reptable = {}


def _fill_reptable():
    """Populate the module-level ``_reptable`` translation table.

    Each entry of the local correspondence list pairs a replacement string
    with the code points that must be replaced by it. The table covers
    Latin-1 Supplement and Latin Extended-A letters, the C1 control range,
    and a few symbols and punctuation marks.

    The function mutates ``_reptable`` in place (no rebinding), so no
    ``global`` declaration is required.
    """
    _corresp = [
        (
            u"A",
            [0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x0100, 0x0102, 0x0104],
        ),
        (u"AE", [0x00C6]),
        (
            u"a",
            [0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x0101, 0x0103, 0x0105],
        ),
        (u"ae", [0x00E6]),
        (u"C", [0x00C7, 0x0106, 0x0108, 0x010A, 0x010C]),
        (u"c", [0x00E7, 0x0107, 0x0109, 0x010B, 0x010D]),
        (u"D", [0x00D0, 0x010E, 0x0110]),
        (u"d", [0x00F0, 0x010F, 0x0111]),
        (
            u"E",
            [0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x0112, 0x0114, 0x0116, 0x0118, 0x011A],
        ),
        (
            u"e",
            [0x00E8, 0x00E9, 0x00EA, 0x00EB, 0x0113, 0x0115, 0x0117, 0x0119, 0x011B],
        ),
        (u"G", [0x011C, 0x011E, 0x0120, 0x0122]),
        (u"g", [0x011D, 0x011F, 0x0121, 0x0123]),
        (u"H", [0x0124, 0x0126]),
        (u"h", [0x0125, 0x0127]),
        (
            u"I",
            [0x00CC, 0x00CD, 0x00CE, 0x00CF, 0x0128, 0x012A, 0x012C, 0x012E, 0x0130],
        ),
        (
            u"i",
            [0x00EC, 0x00ED, 0x00EE, 0x00EF, 0x0129, 0x012B, 0x012D, 0x012F, 0x0131],
        ),
        (u"IJ", [0x0132]),
        (u"ij", [0x0133]),
        (u"J", [0x0134]),
        (u"j", [0x0135]),
        (u"K", [0x0136]),
        (u"k", [0x0137, 0x0138]),
        (u"L", [0x0139, 0x013B, 0x013D, 0x013F, 0x0141]),
        (u"l", [0x013A, 0x013C, 0x013E, 0x0140, 0x0142]),
        (u"N", [0x00D1, 0x0143, 0x0145, 0x0147, 0x014A]),
        (u"n", [0x00F1, 0x0144, 0x0146, 0x0148, 0x0149, 0x014B]),
        (
            u"O",
            [0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D8, 0x014C, 0x014E, 0x0150],
        ),
        (
            u"o",
            [0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F8, 0x014D, 0x014F, 0x0151],
        ),
        (u"OE", [0x0152]),
        (u"oe", [0x0153]),
        (u"R", [0x0154, 0x0156, 0x0158]),
        (u"r", [0x0155, 0x0157, 0x0159]),
        # 0x0218 is the CAPITAL S with comma below: it belongs with "S".
        (u"S", [0x015A, 0x015C, 0x015E, 0x0160, 0x0218]),
        # 0x0161 (s with caron) was previously mistyped as 0x01610, so it was
        # never translated; 0x0219 is the lowercase counterpart of 0x0218.
        (u"s", [0x015B, 0x015D, 0x015F, 0x0161, 0x017F, 0x0219]),
        (u"T", [0x0162, 0x0164, 0x0166]),
        (u"t", [0x0163, 0x0165, 0x0167]),
        (
            u"U",
            [
                0x00D9,
                0x00DA,
                0x00DB,
                0x00DC,
                0x0168,
                0x016A,
                0x016C,
                0x016E,
                0x0170,
                0x0172,
            ],
        ),
        (
            u"u",
            [
                0x00F9,
                0x00FA,
                0x00FB,
                0x00FC,
                0x0169,
                0x016B,
                0x016D,
                0x016F,
                0x0171,
                0x0173,  # u with ogonek, lowercase counterpart of 0x0172
                0x00B5,  # micro sign
            ],
        ),
        (u"W", [0x0174]),
        (u"w", [0x0175]),
        (u"Y", [0x00DD, 0x0176, 0x0178]),
        (u"y", [0x00FD, 0x00FF, 0x0177]),
        (u"Z", [0x0179, 0x017B, 0x017D]),
        (u"z", [0x017A, 0x017C, 0x017E]),
        (u"", list(range(0x80, 0xA0))),  # misc controls (0x80..0x9F), removed
        (u" ", [0x00A0]),  # no-break space
        (u"!", [0x00A1]),  # inverted exclamation mark
        (u"c", [0x00A2]),  # cent
        (u"L", [0x00A3]),  # pound
        (u"o", [0x00A4]),  # currency symbol
        (u"Y", [0x00A5]),  # yen
        (u"|", [0x00A6]),  # broken bar
        (u"S", [0x00A7]),  # section
        (u"", [0x00A8]),  # diaeresis
        (u"", [0x00A9]),  # copyright
        # Left/right angle quotes. The right one is 0xBB; the table used to
        # list 0xBA (masculine ordinal indicator) here by mistake.
        (u'"', [0x00AB, 0x00BB]),
        (u"o", [0x00BA]),  # masculine ordinal indicator, kept ASCII-mapped
        (u" ", [0x00AC]),  # math not sign
        (u"", [0x00AD]),  # soft hyphen
        (u"(r)", [0x00AE]),  # registered
        (u"-", [0x00AF]),  # macron
        (u"", [0x00B0]),  # degree
        (u"+-", [0x00B1]),  # plus-minus
        (u"2", [0x00B2]),  # superscript two
        (u"3", [0x00B3]),  # superscript three
        (u".", [0x00B7]),  # middle dot
        (u"1/4", [0x00BC]),  # one quarter
        (u"1/2", [0x00BD]),  # one half
        (u"3/4", [0x00BE]),  # three quarters
        (u"e", [0x20AC]),  # euro
        (u"--", [0x2013]),  # en dash
        (u"'", [0x2018, 0x2019, 0x201A]),  # single quotation marks
        (u" ", [0x2020]),  # dagger
    ]
    for repchar, codes in _corresp:
        for code in codes:
            _reptable[code] = repchar


_fill_reptable()
def suppression_diacritics(s):
    """Remove accents and other marks from a text.

    A byte string is first decoded as UTF-8, undecodable bytes being
    replaced by U+FFFD. The text is then translated through the
    module-level ``_reptable`` table; characters without an entry in the
    table are left unchanged.

    @param s: the text to clean.
    @type s: bytes or unicode text
    @return: the text with the characters listed in the table replaced.
    @rtype: unicode
    """
    # Test for bytes, not str: on Python 3 ``str`` is already text and
    # cannot be decoded (the previous check raised TypeError there). On
    # Python 2 ``bytes`` is an alias of ``str``, so behavior is unchanged.
    if isinstance(s, bytes):
        s = s.decode("utf8", "replace")
    return s.translate(_reptable)

View File

@ -739,7 +739,7 @@ _ADM_PATTERN = re.compile(r"[\W]+", re.UNICODE) # supprime tout sauf alphanum
def adm_normalize_string(s): # normalize unicode title def adm_normalize_string(s): # normalize unicode title
return scu.suppression_diacritics(_ADM_PATTERN.sub("", s.strip().lower())).replace( return scu.suppress_accents(_ADM_PATTERN.sub("", s.strip().lower())).replace(
"_", "" "_", ""
) )

View File

@ -65,7 +65,6 @@ from app.scodoc.sco_utils import (
) )
from app.scodoc.notes_log import log from app.scodoc.notes_log import log
from app.scodoc.sco_exceptions import ScoGenError from app.scodoc.sco_exceptions import ScoGenError
from .SuppressAccents import suppression_diacritics
from app.scodoc import VERSION from app.scodoc import VERSION
from .VERSION import SCOVERSION, SCONAME from .VERSION import SCOVERSION, SCONAME
import six import six
@ -132,10 +131,7 @@ def makeParas(txt, style, suppress_empty=False):
paras = r paras = r
return [Paragraph(SU(s), style) for s in paras] return [Paragraph(SU(s), style) for s in paras]
except Exception as e: except Exception as e:
if type(e) is IOError: detail = " " + str(e)
detail = " " + e.message
else:
detail = ""
log(traceback.format_exc()) log(traceback.format_exc())
log("Invalid pdf para format: %s" % txt) log("Invalid pdf para format: %s" % txt)
return [ return [

View File

@ -268,7 +268,7 @@ def get_infos_apogee_allaccents(context, nom, prenom):
"essai recup infos avec differents codages des accents" "essai recup infos avec differents codages des accents"
if nom: if nom:
unom = six.text_type(nom, scu.SCO_ENCODING) unom = six.text_type(nom, scu.SCO_ENCODING)
nom_noaccents = str(scu.suppression_diacritics(unom)) nom_noaccents = scu.suppress_accents(unom)
nom_utf8 = unom.encode("utf-8") nom_utf8 = unom.encode("utf-8")
else: else:
nom_noaccents = nom nom_noaccents = nom
@ -276,7 +276,7 @@ def get_infos_apogee_allaccents(context, nom, prenom):
if prenom: if prenom:
uprenom = six.text_type(prenom, scu.SCO_ENCODING) uprenom = six.text_type(prenom, scu.SCO_ENCODING)
prenom_noaccents = str(scu.suppression_diacritics(uprenom)) prenom_noaccents = scu.suppress_accents(uprenom)
prenom_utf8 = uprenom.encode("utf-8") prenom_utf8 = uprenom.encode("utf-8")
else: else:
prenom_noaccents = prenom prenom_noaccents = prenom

View File

@ -42,6 +42,7 @@ import six.moves._thread
import sys import sys
import time import time
import types import types
import unicodedata
import six.moves.urllib.request, six.moves.urllib.parse, six.moves.urllib.error import six.moves.urllib.request, six.moves.urllib.parse, six.moves.urllib.error
import six.moves.urllib.request, six.moves.urllib.error, six.moves.urllib.parse import six.moves.urllib.request, six.moves.urllib.error, six.moves.urllib.parse
from xml.etree.ElementTree import Element from xml.etree.ElementTree import Element
@ -56,7 +57,6 @@ from scodoc_manager import sco_mgr
from config import Config from config import Config
from app.scodoc.SuppressAccents import suppression_diacritics
from app.scodoc.notes_log import log from app.scodoc.notes_log import log
from app.scodoc.sco_vdi import ApoEtapeVDI from app.scodoc.sco_vdi import ApoEtapeVDI
from app.scodoc.sco_xml import quote_xml_attr from app.scodoc.sco_xml import quote_xml_attr
@ -469,8 +469,10 @@ def stripquotes(s):
def suppress_accents(s): def suppress_accents(s):
"s is an ordinary string, encoding given by SCO_ENCODING" "remove accents and suppress non ascii characters from string s"
return str(suppression_diacritics(six.text_type(s, SCO_ENCODING))) return (
unicodedata.normalize("NFD", s).encode("ascii", "ignore").decode(SCO_ENCODING)
)
def sanitize_string(s): def sanitize_string(s):

View File

@ -461,8 +461,7 @@ def get_user_list_xml(dept=None, start="", limit=25, REQUEST=None):
"""Returns XML list of users with name (nomplogin) starting with start. """Returns XML list of users with name (nomplogin) starting with start.
Used for forms auto-completion.""" Used for forms auto-completion."""
userlist = sco_users.get_user_list(dept=dept) userlist = sco_users.get_user_list(dept=dept)
start = scu.suppression_diacritics(unicode(start, "utf-8")) # utf8 #sco8 start = scu.suppress_accents(start).lower()
start = scu.strlower(str(start))
# TODO : à refaire avec une requete SQL #py3 # TODO : à refaire avec une requete SQL #py3
# (et en json) # (et en json)
userlist = [ userlist = [

View File

@ -7,59 +7,64 @@
XXX TODO: OBSOLETE, a moderniser (psycopg2, python 3, encoding) XXX TODO: OBSOLETE, a moderniser (psycopg2, python 3, encoding)
""" """
import pdb,os,sys,psycopg import pdb, os, sys, psycopg
import csv import csv
CSVFILENAME = '/tmp/aaa.csv' CSVFILENAME = "/tmp/aaa.csv"
formsemestre_id = 'SEM229' formsemestre_id = "SEM229"
DBCNXSTRING = 'host=localhost user=scoinfo dbname=SCOINFO password=XXX' DBCNXSTRING = "host=localhost user=scoinfo dbname=SCOINFO password=XXX"
idx_prenom = 1 idx_prenom = 1
idx_nom = 0 idx_nom = 0
# en general, pas d'accents dans le CSV # en general, pas d'accents dans le CSV
SCO_ENCODING = 'iso8859-15' SCO_ENCODING = "iso8859-15"
from SuppressAccents import suppression_diacritics # from SuppressAccents import suppression_diacritics
# XXX a revoir si ce script est utile: en python3, unicodedata.normalize("NFD", s).encode("ascii", "ignore").decode(SCO_ENCODING)
def suppr_acc_and_ponct(s): def suppr_acc_and_ponct(s):
s = s.replace( ' ', '' ) s = s.replace(" ", "")
s = s.replace('-', ' ') s = s.replace("-", " ")
return str(suppression_diacritics( unicode(s, SCO_ENCODING) )) return str(suppression_diacritics(unicode(s, SCO_ENCODING)))
def make_key(nom, prenom): def make_key(nom, prenom):
nom = suppr_acc_and_ponct(nom).upper() nom = suppr_acc_and_ponct(nom).upper()
prenom = suppr_acc_and_ponct(prenom).upper() prenom = suppr_acc_and_ponct(prenom).upper()
return nom + ' ' + prenom[:4] return nom + " " + prenom[:4]
reader = csv.reader(open( CSVFILENAME, "rb"))
reader = csv.reader(open(CSVFILENAME, "rb"))
noms = {} noms = {}
for row in reader: for row in reader:
if row[0][0] != '#': if row[0][0] != "#":
key = make_key( row[idx_nom], row[idx_prenom]) key = make_key(row[idx_nom], row[idx_prenom])
if noms.has_key(key): if noms.has_key(key):
raise ValueError, 'duplicate key: %s' % key raise ValueError, "duplicate key: %s" % key
noms[key] = row noms[key] = row
cnx = psycopg.connect( DBCNXSTRING ) cnx = psycopg.connect(DBCNXSTRING)
cursor = cnx.cursor() cursor = cnx.cursor()
cursor.execute("select * from identite i, notes_formsemestre_inscription ins where i.etudid = ins.etudid and ins.formsemestre_id = '%s'" %formsemestre_id ) cursor.execute(
"select * from identite i, notes_formsemestre_inscription ins where i.etudid = ins.etudid and ins.formsemestre_id = '%s'"
% formsemestre_id
)
R = cursor.dictfetchall() R = cursor.dictfetchall()
nok=0 nok = 0
print 'nom,prenom,ine,nip' print "nom,prenom,ine,nip"
for e in R: for e in R:
key = make_key(e['nom'], e['prenom']) key = make_key(e["nom"], e["prenom"])
if not noms.has_key(key): if not noms.has_key(key):
print '** no match for %s (%s)' % (key, e['etudid']) print "** no match for %s (%s)" % (key, e["etudid"])
else: else:
info = noms[key] info = noms[key]
print '%s,%s,%s,%s' % (e['nom'],e['prenom'], e['code_ine'], e['code_nip']) print "%s,%s,%s,%s" % (e["nom"], e["prenom"], e["code_ine"], e["code_nip"])
nok+=1 nok += 1
cnx.commit() cnx.commit()
print '%d etudiants, %d ok' % (len(R), nok) print "%d etudiants, %d ok" % (len(R), nok)