From 99279a96bd1878eb5f45711d158cc1bb54587774 Mon Sep 17 00:00:00 2001 From: Emmanuel Viennet Date: Mon, 12 Jul 2021 11:54:04 +0200 Subject: [PATCH] replaced SuppressAccent --- app/scodoc/SuppressAccents.py | 207 -------------------------------- app/scodoc/sco_import_etuds.py | 2 +- app/scodoc/sco_pdf.py | 6 +- app/scodoc/sco_portal_apogee.py | 4 +- app/scodoc/sco_utils.py | 8 +- app/views/users.py | 3 +- misc/get_codes_from_names.py | 57 +++++---- 7 files changed, 41 insertions(+), 246 deletions(-) delete mode 100644 app/scodoc/SuppressAccents.py diff --git a/app/scodoc/SuppressAccents.py b/app/scodoc/SuppressAccents.py deleted file mode 100644 index 18332fbb6..000000000 --- a/app/scodoc/SuppressAccents.py +++ /dev/null @@ -1,207 +0,0 @@ -# -*- mode: python -*- -# -*- coding: utf-8 -*- - -"""Suppression des accents d'une chaine - -Source: http://wikipython.flibuste.net/moin.py/JouerAvecUnicode#head-1213938516c633958921591439c33d202244e2f4 -""" -import six - -_reptable = {} - - -def _fill_reptable(): - _corresp = [ - ( - u"A", - [0x00C0, 0x00C1, 0x00C2, 0x00C3, 0x00C4, 0x00C5, 0x0100, 0x0102, 0x0104], - ), - (u"AE", [0x00C6]), - ( - u"a", - [0x00E0, 0x00E1, 0x00E2, 0x00E3, 0x00E4, 0x00E5, 0x0101, 0x0103, 0x0105], - ), - (u"ae", [0x00E6]), - (u"C", [0x00C7, 0x0106, 0x0108, 0x010A, 0x010C]), - (u"c", [0x00E7, 0x0107, 0x0109, 0x010B, 0x010D]), - (u"D", [0x00D0, 0x010E, 0x0110]), - (u"d", [0x00F0, 0x010F, 0x0111]), - ( - u"E", - [0x00C8, 0x00C9, 0x00CA, 0x00CB, 0x0112, 0x0114, 0x0116, 0x0118, 0x011A], - ), - ( - u"e", - [ - 0x00E8, - 0xE9, - 0x00E9, - 0x00EA, - 0xEB, - 0x00EB, - 0x0113, - 0x0115, - 0x0117, - 0x0119, - 0x011B, - ], - ), - (u"G", [0x011C, 0x011E, 0x0120, 0x0122]), - (u"g", [0x011D, 0x011F, 0x0121, 0x0123]), - (u"H", [0x0124, 0x0126]), - (u"h", [0x0125, 0x0127]), - ( - u"I", - [0x00CC, 0x00CD, 0x00CE, 0x00CF, 0x0128, 0x012A, 0x012C, 0x012E, 0x0130], - ), - ( - u"i", - [0x00EC, 0x00ED, 0x00EE, 0x00EF, 0x0129, 0x012B, 0x012D, 0x012F, 0x0131], - ), - (u"IJ", [0x0132]), - (u"ij", [0x0133]), - (u"J", [0x0134]), - (u"j", [0x0135]), - (u"K", [0x0136]), - (u"k", [0x0137, 0x0138]), - (u"L", [0x0139, 0x013B, 0x013D, 0x013F, 0x0141]), - (u"l", [0x013A, 0x013C, 0x013E, 0x0140, 0x0142]), - (u"N", [0x00D1, 0x0143, 0x0145, 0x0147, 0x014A]), - (u"n", [0x00F1, 0x0144, 0x0146, 0x0148, 0x0149, 0x014B]), - ( - u"O", - [0x00D2, 0x00D3, 0x00D4, 0x00D5, 0x00D6, 0x00D8, 0x014C, 0x014E, 0x0150], - ), - ( - u"o", - [0x00F2, 0x00F3, 0x00F4, 0x00F5, 0x00F6, 0x00F8, 0x014D, 0x014F, 0x0151], - ), - (u"OE", [0x0152]), - (u"oe", [0x0153]), - (u"R", [0x0154, 0x0156, 0x0158]), - (u"r", [0x0155, 0x0157, 0x0159]), - (u"S", [0x015A, 0x015C, 0x015E, 0x0160]), - (u"s", [0x015B, 0x015D, 0x015F, 0x01610, 0x017F, 0x0218]), - (u"T", [0x0162, 0x0164, 0x0166]), - (u"t", [0x0163, 0x0165, 0x0167]), - ( - u"U", - [ - 0x00D9, - 0x00DA, - 0x00DB, - 0x00DC, - 0x0168, - 0x016A, - 0x016C, - 0x016E, - 0x0170, - 0x172, - ], - ), - ( - u"u", - [ - 0x00F9, - 0x00FA, - 0x00FB, - 0x00FC, - 0x0169, - 0x016B, - 0x016D, - 0x016F, - 0x0171, - 0xB5, - ], - ), - (u"W", [0x0174]), - (u"w", [0x0175]), - (u"Y", [0x00DD, 0x0176, 0x0178]), - (u"y", [0x00FD, 0x00FF, 0x0177]), - (u"Z", [0x0179, 0x017B, 0x017D]), - (u"z", [0x017A, 0x017C, 0x017E]), - ( - u"", - [ - 0x80, - 0x81, - 0x82, - 0x83, - 0x84, - 0x85, - 0x86, - 0x87, - 0x88, - 0x89, - 0x8A, - 0x8B, - 0x8C, - 0x8D, - 0x8E, - 0x8F, - 0x90, - 0x91, - 0x92, - 0x93, - 0x94, - 0x95, - 0x96, - 0x97, - 0x98, - 0x99, - 0x9A, - 0x9B, - 0x9C, - 0x9D, - 0x9E, - 0x9F, - ], - ), # misc controls - (u" ", [0x00A0]), #   - (u"!", [0xA1]), # ¡ - (u"c", [0xA2]), # cent - (u"L", [0xA3]), # pound - (u"o", [0xA4]), # currency symbol - (u"Y", [0xA5]), # yen - (u"|", [0xA6]), # Broken Bar ¦ - (u"S", [0xA7]), # section - (u"", [0xA8]), # diaeresis ¨ - (u"", [0xA9]), # copyright - (u'"', [0xAB, 0xBA]), # «, » <<, >> - (u" ", [0xAC]), # Math Not Sign - (u"", [0xAD]), # DashPunctuation - (u"(r)", [0xAE]), # registred - (u"-", [0xAF]), # macron - (u"", [0xB0]), # degre - (u"+-", [0xB1]), # +- - (u"2", [0x00B2, 0xB2]), # deux exposant - (u"3", [0xB3]), # 3 exposant - (u".", [0xB7]), # ·, - (u"1/4", [0xBC]), # 1/4 - (u"1/2", [0xBD]), # 1/2 - (u"3/4", [0xBE]), # 3/4 - (u"e", [0x20AC]), # euro - (u"--", [0x2013]), # EN DASH - (u"'", [0x2018, 0x2019, 0x201A]), # LEFT, RIGHT SINGLE QUOTATION MARK - (u" ", [0x2020]), # dagger - ] - global _reptable - for repchar, codes in _corresp: - for code in codes: - _reptable[code] = repchar - - -_fill_reptable() - - -def suppression_diacritics(s): - """Suppression des accents et autres marques. - - @param s: le texte à nettoyer. - @type s: str ou unicode - @return: le texte nettoyé de ses marques diacritiques. - @rtype: unicode - """ - if isinstance(s, str): - s = six.text_type(s, "utf8", "replace") - return s.translate(_reptable) diff --git a/app/scodoc/sco_import_etuds.py b/app/scodoc/sco_import_etuds.py index 2d8732daf..a37aacbf3 100644 --- a/app/scodoc/sco_import_etuds.py +++ b/app/scodoc/sco_import_etuds.py @@ -739,7 +739,7 @@ _ADM_PATTERN = re.compile(r"[\W]+", re.UNICODE) # supprime tout sauf alphanum def adm_normalize_string(s): # normalize unicode title - return scu.suppression_diacritics(_ADM_PATTERN.sub("", s.strip().lower())).replace( + return scu.suppress_accents(_ADM_PATTERN.sub("", s.strip().lower())).replace( "_", "" ) diff --git a/app/scodoc/sco_pdf.py b/app/scodoc/sco_pdf.py index 66050dff3..2efe95797 100755 --- a/app/scodoc/sco_pdf.py +++ b/app/scodoc/sco_pdf.py @@ -65,7 +65,6 @@ from app.scodoc.sco_utils import ( ) from app.scodoc.notes_log import log from app.scodoc.sco_exceptions import ScoGenError -from .SuppressAccents import suppression_diacritics from app.scodoc import VERSION from .VERSION import SCOVERSION, SCONAME import six @@ -132,10 +131,7 @@ def makeParas(txt, style, suppress_empty=False): paras = r return [Paragraph(SU(s), style) for s in paras] except Exception as e: - if type(e) is IOError: - detail = " " + e.message - else: - detail = "" + detail = " " + str(e) log(traceback.format_exc()) log("Invalid pdf para format: %s" % txt) return [ diff --git a/app/scodoc/sco_portal_apogee.py b/app/scodoc/sco_portal_apogee.py index a7a737c0d..898b79c31 100644 --- a/app/scodoc/sco_portal_apogee.py +++ b/app/scodoc/sco_portal_apogee.py @@ -268,7 +268,7 @@ def get_infos_apogee_allaccents(context, nom, prenom): "essai recup infos avec differents codages des accents" if nom: unom = six.text_type(nom, scu.SCO_ENCODING) - nom_noaccents = str(scu.suppression_diacritics(unom)) + nom_noaccents = scu.suppress_accents(unom) nom_utf8 = unom.encode("utf-8") else: nom_noaccents = nom @@ -276,7 +276,7 @@ def get_infos_apogee_allaccents(context, nom, prenom): if prenom: uprenom = six.text_type(prenom, scu.SCO_ENCODING) - prenom_noaccents = str(scu.suppression_diacritics(uprenom)) + prenom_noaccents = scu.suppress_accents(uprenom) prenom_utf8 = uprenom.encode("utf-8") else: prenom_noaccents = prenom diff --git a/app/scodoc/sco_utils.py b/app/scodoc/sco_utils.py index 819a37e9f..d7b28385c 100644 --- a/app/scodoc/sco_utils.py +++ b/app/scodoc/sco_utils.py @@ -42,6 +42,7 @@ import six.moves._thread import sys import time import types +import unicodedata import six.moves.urllib.request, six.moves.urllib.parse, six.moves.urllib.error import six.moves.urllib.request, six.moves.urllib.error, six.moves.urllib.parse from xml.etree.ElementTree import Element @@ -56,7 +57,6 @@ from scodoc_manager import sco_mgr from config import Config -from app.scodoc.SuppressAccents import suppression_diacritics from app.scodoc.notes_log import log from app.scodoc.sco_vdi import ApoEtapeVDI from app.scodoc.sco_xml import quote_xml_attr @@ -469,8 +469,10 @@ def stripquotes(s): def suppress_accents(s): - "s is an ordinary string, encoding given by SCO_ENCODING" - return str(suppression_diacritics(six.text_type(s, SCO_ENCODING))) + "remove accents and suppress non ascii characters from string s" + return ( + unicodedata.normalize("NFD", s).encode("ascii", "ignore").decode(SCO_ENCODING) + ) def sanitize_string(s): diff --git a/app/views/users.py b/app/views/users.py index c57cbddc6..58888eb51 100644 --- a/app/views/users.py +++ b/app/views/users.py @@ -461,8 +461,7 @@ def get_user_list_xml(dept=None, start="", limit=25, REQUEST=None): """Returns XML list of users with name (nomplogin) starting with start. Used for forms auto-completion.""" userlist = sco_users.get_user_list(dept=dept) - start = scu.suppression_diacritics(unicode(start, "utf-8")) # utf8 #sco8 - start = scu.strlower(str(start)) + start = scu.suppress_accents(start).lower() # TODO : à refaire avec une requete SQL #py3 # (et en json) userlist = [ diff --git a/misc/get_codes_from_names.py b/misc/get_codes_from_names.py index 6cc8bc49a..947561a85 100644 --- a/misc/get_codes_from_names.py +++ b/misc/get_codes_from_names.py @@ -7,59 +7,64 @@ XXX TODO: OBSOLETE, a moderniser (psycopg2, python 3, encoding) """ -import pdb,os,sys,psycopg +import pdb, os, sys, psycopg import csv -CSVFILENAME = '/tmp/aaa.csv' -formsemestre_id = 'SEM229' -DBCNXSTRING = 'host=localhost user=scoinfo dbname=SCOINFO password=XXX' +CSVFILENAME = "/tmp/aaa.csv" +formsemestre_id = "SEM229" +DBCNXSTRING = "host=localhost user=scoinfo dbname=SCOINFO password=XXX" idx_prenom = 1 idx_nom = 0 - - # en general, pas d'accents dans le CSV -SCO_ENCODING = 'iso8859-15' -from SuppressAccents import suppression_diacritics +SCO_ENCODING = "iso8859-15" +# from SuppressAccents import suppression_diacritics + +# XXX a revoir si ce script est utile: en python3, unicodedata.normalize("NFD", s).encode("ascii", "ignore").decode(SCO_ENCODING) def suppr_acc_and_ponct(s): - s = s.replace( ' ', '' ) - s = s.replace('-', ' ') - return str(suppression_diacritics( unicode(s, SCO_ENCODING) )) + s = s.replace(" ", "") + s = s.replace("-", " ") + return str(suppression_diacritics(unicode(s, SCO_ENCODING))) + def make_key(nom, prenom): - nom = suppr_acc_and_ponct(nom).upper() + nom = suppr_acc_and_ponct(nom).upper() prenom = suppr_acc_and_ponct(prenom).upper() - return nom + ' ' + prenom[:4] + return nom + " " + prenom[:4] -reader = csv.reader(open( CSVFILENAME, "rb")) + +reader = csv.reader(open(CSVFILENAME, "rb")) noms = {} for row in reader: - if row[0][0] != '#': - key = make_key( row[idx_nom], row[idx_prenom]) + if row[0][0] != "#": + key = make_key(row[idx_nom], row[idx_prenom]) if noms.has_key(key): - raise ValueError, 'duplicate key: %s' % key + raise ValueError, "duplicate key: %s" % key noms[key] = row -cnx = psycopg.connect( DBCNXSTRING ) +cnx = psycopg.connect(DBCNXSTRING) cursor = cnx.cursor() -cursor.execute("select * from identite i, notes_formsemestre_inscription ins where i.etudid = ins.etudid and ins.formsemestre_id = '%s'" %formsemestre_id ) +cursor.execute( + "select * from identite i, notes_formsemestre_inscription ins where i.etudid = ins.etudid and ins.formsemestre_id = '%s'" + % formsemestre_id +) R = cursor.dictfetchall() -nok=0 -print 'nom,prenom,ine,nip' +nok = 0 +print "nom,prenom,ine,nip" for e in R: - key = make_key(e['nom'], e['prenom']) + key = make_key(e["nom"], e["prenom"]) if not noms.has_key(key): - print '** no match for %s (%s)' % (key, e['etudid']) + print "** no match for %s (%s)" % (key, e["etudid"]) else: info = noms[key] - print '%s,%s,%s,%s' % (e['nom'],e['prenom'], e['code_ine'], e['code_nip']) - nok+=1 + print "%s,%s,%s,%s" % (e["nom"], e["prenom"], e["code_ine"], e["code_nip"]) + nok += 1 cnx.commit() -print '%d etudiants, %d ok' % (len(R), nok) +print "%d etudiants, %d ok" % (len(R), nok)