forked from ScoDoc/ScoDoc
136 lines
5.2 KiB
Python
136 lines
5.2 KiB
Python
|
#!/usr/bin/env python
|
|||
|
# -*- coding: windows-1251 -*-
|
|||
|
|
|||
|
# Copyright (C) 2005 Roman V. Kiseliov
|
|||
|
# All rights reserved.
|
|||
|
#
|
|||
|
# Redistribution and use in source and binary forms, with or without
|
|||
|
# modification, are permitted provided that the following conditions
|
|||
|
# are met:
|
|||
|
#
|
|||
|
# 1. Redistributions of source code must retain the above copyright
|
|||
|
# notice, this list of conditions and the following disclaimer.
|
|||
|
#
|
|||
|
# 2. Redistributions in binary form must reproduce the above copyright
|
|||
|
# notice, this list of conditions and the following disclaimer in
|
|||
|
# the documentation and/or other materials provided with the
|
|||
|
# distribution.
|
|||
|
#
|
|||
|
# 3. All advertising materials mentioning features or use of this
|
|||
|
# software must display the following acknowledgment:
|
|||
|
# "This product includes software developed by
|
|||
|
# Roman V. Kiseliov <roman@kiseliov.ru>."
|
|||
|
#
|
|||
|
# 4. Redistributions of any form whatsoever must retain the following
|
|||
|
# acknowledgment:
|
|||
|
# "This product includes software developed by
|
|||
|
# Roman V. Kiseliov <roman@kiseliov.ru>."
|
|||
|
#
|
|||
|
# THIS SOFTWARE IS PROVIDED BY Roman V. Kiseliov ``AS IS'' AND ANY
|
|||
|
# EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|||
|
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|||
|
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Roman V. Kiseliov OR
|
|||
|
# ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|||
|
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
|
|||
|
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|||
|
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|||
|
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
|||
|
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|||
|
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
|||
|
# OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
|
|||
|
|
|||
|
'''
|
|||
|
From BIFF8 on, strings are always stored using UTF-16LE text encoding. The
|
|||
|
character array is a sequence of 16-bit values. Additionally it is
|
|||
|
possible to use a compressed format, which omits the high bytes of all
|
|||
|
characters, if they are all zero.
|
|||
|
|
|||
|
The following tables describe the standard format of the entire string, but
|
|||
|
in many records the strings differ from this format. This will be mentioned
|
|||
|
separately. It is possible (but not required) to store Rich-Text formatting
|
|||
|
information and Asian phonetic information inside a Unicode string. This
|
|||
|
results in four different ways to store a string. The character array
|
|||
|
is not zero-terminated.
|
|||
|
|
|||
|
The string consists of the character count (as usual an 8-bit value or
|
|||
|
a 16-bit value), option flags, the character array and optional formatting
|
|||
|
information. If the string is empty, sometimes the option flags field will
|
|||
|
not occur. This is mentioned at the respective place.
|
|||
|
|
|||
|
Offset Size Contents
|
|||
|
0 1 or 2 Length of the string (character count, ln)
|
|||
|
1 or 2 1 Option flags:
|
|||
|
Bit Mask Contents
|
|||
|
0 01H Character compression (ccompr):
|
|||
|
0 = Compressed (8-bit characters)
|
|||
|
1 = Uncompressed (16-bit characters)
|
|||
|
2 04H Asian phonetic settings (phonetic):
|
|||
|
0 = Does not contain Asian phonetic settings
|
|||
|
1 = Contains Asian phonetic settings
|
|||
|
3 08H Rich-Text settings (richtext):
|
|||
|
0 = Does not contain Rich-Text settings
|
|||
|
1 = Contains Rich-Text settings
|
|||
|
[2 or 3] 2 (optional, only if richtext=1) Number of Rich-Text formatting runs (rt)
|
|||
|
[var.] 4 (optional, only if phonetic=1) Size of Asian phonetic settings block (in bytes, sz)
|
|||
|
var. ln or
2*ln Character array (8-bit characters or 16-bit characters, dependent on ccompr)
|
|||
|
[var.] 4*rt (optional, only if richtext=1) List of rt formatting runs
|
|||
|
[var.] sz (optional, only if phonetic=1) Asian Phonetic Settings Block
|
|||
|
'''
|
|||
|
|
|||
|
|
|||
|
__rev_id__ = """$Id: UnicodeUtils.py,v 1.4 2005/07/20 07:24:11 rvk Exp $"""
|
|||
|
|
|||
|
|
|||
|
import struct
|
|||
|
|
|||
|
|
|||
|
DEFAULT_ENCODING = 'cp1251'
|
|||
|
|
|||
|
def u2ints(ustr):
    """Return a list with the ordinal (``ord``) of every element of ``ustr``."""
    return list(map(ord, ustr))
|
|||
|
|
|||
|
def u2bytes(ustr):
    """Pack every character of ``ustr`` as a little-endian unsigned 16-bit value.

    The ordinals come from u2ints(); the result is the concatenation of one
    '<H' field per character (an empty input gives an empty result).
    """
    code_units = u2ints(ustr)
    # '<3H' is the same struct format as '<HHH': a repeat count per field.
    fmt = '<%dH' % len(code_units)
    return struct.pack(fmt, *code_units)
|
|||
|
|
|||
|
def upack2(_str):
    """Pack ``_str`` as a BIFF8 string with a 16-bit character count.

    Output layout: character count (little-endian unsigned short), one
    option-flags byte, then the character array.

    * A byte string that decodes as pure ASCII is written as-is with
      flags = 0 (compressed, 8-bit characters).
    * A ``unicode`` object is always written with flags = 1 (uncompressed,
      16-bit characters produced by u2bytes()).
    * Any other byte string is decoded with DEFAULT_ENCODING and then written
      uncompressed like a ``unicode`` object.

    Raises whatever ``unicode()`` raises when ``_str`` is neither a
    ``unicode`` object nor decodable (e.g. TypeError for non-string input,
    UnicodeDecodeError if DEFAULT_ENCODING cannot decode the bytes).
    """
    if isinstance(_str, unicode):
        text = _str
    else:
        try:
            # Only used as an "is this pure ASCII?" probe; result is not needed.
            unicode(_str, 'ascii')
        except UnicodeError:
            text = unicode(_str, DEFAULT_ENCODING)
        else:
            # Pure ASCII bytes: one byte per character, "compressed" flag.
            return struct.pack('<HB', len(_str), 0) + _str
    # Count characters of the decoded text rather than bytes of the input so
    # the prefix stays correct even if DEFAULT_ENCODING is a multi-byte codec.
    return struct.pack('<HB', len(text), 1) + u2bytes(text)
|
|||
|
|
|||
|
def upack1(_str):
    """Pack ``_str`` as a BIFF8 string with an 8-bit character count.

    Output layout: character count (one unsigned byte), one option-flags
    byte, then the character array.

    * A byte string that decodes as pure ASCII is written as-is with
      flags = 0 (compressed, 8-bit characters).
    * A ``unicode`` object is always written with flags = 1 (uncompressed,
      16-bit characters produced by u2bytes()).
    * Any other byte string is decoded with DEFAULT_ENCODING and then written
      uncompressed like a ``unicode`` object.

    The character count must fit the struct 'B' format (0..255).

    Raises whatever ``unicode()`` raises when ``_str`` is neither a
    ``unicode`` object nor decodable (e.g. TypeError for non-string input,
    UnicodeDecodeError if DEFAULT_ENCODING cannot decode the bytes).
    """
    if isinstance(_str, unicode):
        text = _str
    else:
        try:
            # Only used as an "is this pure ASCII?" probe; result is not needed.
            unicode(_str, 'ascii')
        except UnicodeError:
            text = unicode(_str, DEFAULT_ENCODING)
        else:
            # Pure ASCII bytes: one byte per character, "compressed" flag.
            return struct.pack('BB', len(_str), 0) + _str
    # Count characters of the decoded text rather than bytes of the input so
    # the prefix stays correct even if DEFAULT_ENCODING is a multi-byte codec.
    return struct.pack('BB', len(text), 1) + u2bytes(text)
|
|||
|
|
|||
|
if __name__ == '__main__':
    # Manual smoke test: dump the output of each packer for the same sample
    # string into its own file (out0.bin, out1.bin, out2.bin) for inspection.
    # NOTE(review): the non-ASCII part of the literal is a byte string whose
    # value depends on the source encoding declared at the top of the file.
    sample = '<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>: unicode'
    for out_name, pack in (('out0.bin', u2bytes),
                           ('out1.bin', upack1),
                           ('out2.bin', upack2)):
        f = open(out_name, 'wb')
        try:
            f.write(pack(sample))
        finally:
            # Close the handle even if packing or writing fails.
            f.close()
|
|||
|
|
|||
|
|
|||
|
|