#!/usr/bin/env python # -*- coding: windows-1251 -*- # Copyright (C) 2005 Roman V. Kiseliov # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # # 3. All advertising materials mentioning features or use of this # software must display the following acknowledgment: # "This product includes software developed by # Roman V. Kiseliov ." # # 4. Redistributions of any form whatsoever must retain the following # acknowledgment: # "This product includes software developed by # Roman V. Kiseliov ." # # THIS SOFTWARE IS PROVIDED BY Roman V. Kiseliov ``AS IS'' AND ANY # EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL Roman V. Kiseliov OR # ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED # OF THE POSSIBILITY OF SUCH DAMAGE. __rev_id__ = """$Id: ImportXLS.py,v 1.6 2005/10/26 07:44:24 rvk Exp $""" import UnicodeUtils import CompoundDoc import ExcelMagic from struct import pack, unpack def parse_xls(filename, encoding = None, doc=None): # XXX added doc arg. ########################################################################## def process_BOUNDSHEET(biff8, rec_data): sheet_stream_pos, visibility, sheet_type = unpack('>= 2 else: ieee754 = struct.pack('8B', 0, 0, 0, 0, b0 & 0xFC, b1, b2, b3) result , = unpack('= len(curr_block): curr_block_num += 1 curr_block = sst_continues[curr_block_num] pos = 0 chars_num, options = unpack('= len(curr_block): curr_block_num += 1 curr_block = sst_continues[curr_block_num] options = ord(curr_block[0]) pos = 1 #print curr_block_num compressed = (options & 0x01) == 0 if compressed: chars_end = pos + chars_num - curr_char else: chars_end = pos + 2*(chars_num - curr_char) #print compressed, has_asian_phonetic, has_format_runs splitted = chars_end > len(curr_block) if splitted: chars_end = len(curr_block) #print splitted, curr_char, pos, chars_end, repr(curr_block[pos:chars_end]) if compressed: result += curr_block[pos:chars_end].decode('latin_1', 'replace') else: result += curr_block[pos:chars_end].decode('utf_16_le', 'replace') pos = chars_end curr_char = len(result) # end while # TODO: handle spanning format runs over CONTINUE blocks ??? tail_size = 4*runs_num + asian_phonetic_size if len(curr_block) < pos + tail_size: pos = pos + tail_size - len(curr_block) curr_block_num += 1 curr_block = sst_continues[curr_block_num] else: pos += tail_size #print result.encode('cp866') SST[curr_str_num] = result curr_str_num += 1 return SST ##################################################################################### import struct encodings = { 0x016F: 'ascii', #ASCII 0x01B5: 'cp437', #IBM PC CP-437 (US) 0x02D0: 'cp720', #IBM PC CP-720 (OEM Arabic) 0x02E1: 'cp737', #IBM PC CP-737 (Greek) 0x0307: 'cp775', #IBM PC CP-775 (Baltic) 0x0352: 'cp850', #IBM PC CP-850 (Latin I) 0x0354: 'cp852', #IBM PC CP-852 (Latin II (Central European)) 0x0357: 'cp855', #IBM PC CP-855 (Cyrillic) 0x0359: 'cp857', #IBM PC CP-857 (Turkish) 0x035A: 'cp858', #IBM PC CP-858 (Multilingual Latin I with Euro) 0x035C: 'cp860', #IBM PC CP-860 (Portuguese) 0x035D: 'cp861', #IBM PC CP-861 (Icelandic) 0x035E: 'cp862', #IBM PC CP-862 (Hebrew) 0x035F: 'cp863', #IBM PC CP-863 (Canadian (French)) 0x0360: 'cp864', #IBM PC CP-864 (Arabic) 0x0361: 'cp865', #IBM PC CP-865 (Nordic) 0x0362: 'cp866', #IBM PC CP-866 (Cyrillic (Russian)) 0x0365: 'cp869', #IBM PC CP-869 (Greek (Modern)) 0x036A: 'cp874', #Windows CP-874 (Thai) 0x03A4: 'cp932', #Windows CP-932 (Japanese Shift-JIS) 0x03A8: 'cp936', #Windows CP-936 (Chinese Simplified GBK) 0x03B5: 'cp949', #Windows CP-949 (Korean (Wansung)) 0x03B6: 'cp950', #Windows CP-950 (Chinese Traditional BIG5) 0x04B0: 'utf_16_le', #UTF-16 (BIFF8) 0x04E2: 'cp1250', #Windows CP-1250 (Latin II) (Central European) 0x04E3: 'cp1251', #Windows CP-1251 (Cyrillic) 0x04E4: 'cp1252', #Windows CP-1252 (Latin I) (BIFF4-BIFF7) 0x04E5: 'cp1253', #Windows CP-1253 (Greek) 0x04E6: 'cp1254', #Windows CP-1254 (Turkish) 0x04E7: 'cp1255', #Windows CP-1255 (Hebrew) 0x04E8: 'cp1256', #Windows CP-1256 (Arabic) 0x04E9: 'cp1257', #Windows CP-1257 (Baltic) 0x04EA: 'cp1258', #Windows CP-1258 (Vietnamese) 0x0551: 'cp1361', #Windows CP-1361 (Korean (Johab)) 0x2710: 'mac_roman', #Apple Roman 0x8000: 'mac_roman', #Apple Roman 0x8001: 'cp1252' #Windows CP-1252 (Latin I) (BIFF2-BIFF3) } biff8 = True SST = {} sheets = [] sheet_names = [] values = {} ws_num = 0 BOFs = 0 EOFs = 0 # Inside MS Office document looks like filesystem # We need extract stream named 'Workbook' or 'Book' ole_streams = CompoundDoc.Reader(filename, doc=doc).STREAMS if 'Workbook' in ole_streams: workbook_stream = ole_streams['Workbook'] elif 'Book' in ole_streams: workbook_stream = ole_streams['Book'] else: raise Exception, 'No workbook stream in file.' workbook_stream_len = len(workbook_stream) stream_pos = 0 # Excel's method of data storing is based on # ancient technology "TLV" (Type, Length, Value). # In addition, if record size grows to some limit # Excel writes CONTINUE records while stream_pos < workbook_stream_len and EOFs <= ws_num: rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4]) stream_pos += 4 rec_data = workbook_stream[stream_pos:stream_pos+data_size] stream_pos += data_size if rec_id == 0x0809: # BOF #print 'BOF', BOFs += 1 ver, substream_type = unpack('<2H', rec_data[:4]) if substream_type == 0x0005: # workbook global substream biff8 = ver >= 0x0600 elif substream_type == 0x0010: # worksheet substream pass else: # skip chart stream or unknown stream # stream offsets may be used from BOUNDSHEET record rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4]) while rec_id != 0x000A: # EOF #print 'SST CONTINUE' stream_pos += 4 stream_pos += data_size rec_id, data_size = unpack('<2H', workbook_stream[stream_pos:stream_pos+4]) #print 'BIFF8 == ', biff8 elif rec_id == 0x000A: # EOF #print 'EOF' if BOFs > 1: sheets.extend([values]) values = {} EOFs += 1 elif rec_id == 0x0042: # CODEPAGE cp , = unpack('