0001 """ 0002 DSV.py - Cliff Wells, 2002 0003 Import/export DSV (delimiter separated values, a generalization of CSV). 0004 0005 $Id: DSV.py 3878 2007-01-09 22:28:37Z djpham $ 0006 Modified by Joe Pham <djpham@bitpim.org> to accommodate wxPython 2.8+ 0007 0008 Basic use: 0009 0010 from DSV import DSV 0011 0012 data = file.read() 0013 qualifier = DSV.guessTextQualifier(data) # optional 0014 data = DSV.organizeIntoLines(data, textQualifier = qualifier) 0015 delimiter = DSV.guessDelimiter(data) # optional 0016 data = DSV.importDSV(data, delimiter = delimiter, textQualifier = qualifier) 0017 hasHeader = DSV.guessHeaders(data) # optional 0018 0019 If you know the delimiters, qualifiers, etc, you may skip the optional 0020 'guessing' steps as they rely on heuristics anyway (although they seem 0021 to work well, there is no guarantee they are correct). What they are 0022 best used for is to make a good guess regarding the data structure and then 0023 let the user confirm it. 0024 0025 As such there is a 'wizard' to aid in this process (use this in lieu of 0026 the above code - requires wxPython): 0027 0028 from DSV import DSV 0029 0030 dlg = DSV.ImportWizardDialog(parent, -1, 'DSV Import Wizard', filename) 0031 dlg.ShowModal() 0032 headers, data = dlg.ImportData() # may also return None 0033 dlg.Destroy() 0034 0035 The dlg.ImportData() method may also take a function as an optional argument 0036 specifying what it should do about malformed rows. See the example at the bottom 0037 of this file. A few common functions are provided in this file (padRow, skipRow, 0038 useRow). 0039 0040 Requires Python 2.0 or later 0041 Wizards tested with wxPython 2.2.5/NT 4.0, 2.3.2/Win2000 and Linux/GTK (RedHat 7.x) 0042 """ 0043 0044 __version__ = '1.4' 0045 0046 """ 0047 Bugs/Caveats: 0048 - Although I've tested this stuff on varied data, I'm sure there are cases 0049 that I haven't seen that will choke any one of these routines (or at least 0050 return invalid data). This is beta code! 0051 - guessTextQualifier() algorithm is limited to quotes (double or single). 0052 - Surprising feature: Hitting <enter> on wxSpinCtrl causes seg 0053 fault under Linux/GTK (not Win32). Strangely, pressing <tab> seems ok. 0054 Therefore, I had to use wxSpinButton. Also, spurious spin events get 0055 generated for both of these controls (e.g. when calling wxBeginBusyCursor) 0056 - Keyboard navigation needs to be implemented on wizards 0057 - There may be issues with cr/lf translation, although I haven't yet seen any. 0058 0059 Why another CSV tool?: 0060 - Because I needed a more flexible CSV importer, one that could accept different 0061 delimiters (not just commas or tabs), one that could make an intelligent guess 0062 regarding file structure (for user convenience), be compatible with the files 0063 output by MS Excel, and finally, be easily integrated with a wizard. All of the 0064 modules I have seen prior to this fell short on one count or another. 0065 - It seemed interesting. 0066 0067 To do: 0068 - Better guessTextQualifier() algorithm. In the perfect world I envision, I can 0069 use any character as a text qualifier, not just quotes. 0070 - Finish wizards and move them into separate module. 0071 - Better guessHeaders() algorithm, although this is difficult. 0072 - Optimize maps() - try to eliminate lambda when possible 0073 - Optimize memory usage. Presently the entire file is loaded and then saved as 0074 a list. A better approach might be to analyze a smaller part of the file and 0075 then return an iterator to step through it. 0076 """ 0077 0078 # Changelog 0079 # 1.4: 0080 # - Fixed small bug in demo (forgotten Destroy()) that caused a hang 0081 # when Cancel was pressed. 0082 # - Removed extraneous guessHeaders() call in wizard. I can only say, 0083 # "what the??" and remember to profile. This was a huge time waster. 0084 # 0085 # 1.3.9 0086 # - Fixed real problem on Win32 in that wxProgressDialog must reach max 0087 # value in order to close. 0088 # 0089 # 1.3.8 0090 # - Change demo to use wxApp rather than wxPySimpleApp as it seemed 0091 # to have problems on Win32 (per Kevin Altis) 0092 # 0093 # 1.37 0094 # - Fix for font issue under GTK2 (thanks to Ahmad Baitalmal) 0095 # - Added some space below the Ok/Cancel buttons. 0096 # 0097 # 1.36 0098 # - Bugfix submitted by "nobody" ;) on SF 0099 # 0100 # 1.3.4 to 1.3.5: 0101 # - Nigel Hathaway finds yet another bug (or two). Can't seem to make him 0102 # use something else, so they had to be fixed. It's especially difficult 0103 # to ignore him since he provided the fix. Very annoying. 0104 # - Problem with odd quote/delimiter combinations (SF bug #620284) 0105 # - Losing empty fields at beginning/end (#619771) 0106 # - Whitespace stripped from around string (#620115) 0107 # 0108 # 1.3.3 to 1.3.4(a): 0109 # - Fixed bug in exportDSV that failed to quote data containing delimiter 0110 # thanks to nhathaway@users.sourceforge.net 0111 # 0112 # 1.3 to 1.3.1: 0113 # - Test for presence of wxPython (since it's not required except for wizard) 0114 # - Changed "from wxPython.wx import *" to "from wxPython import wx" 0115 # - Changed sample csv file (darkwave.csv) to demonstrate embedded quotes 0116 0117 import sys 0118 # import pre as re # sre was broken, appears okay now. Try this if there are problems. 0119 import re 0120 import copy 0121 import exceptions 0122 import string 0123 # RedHat 8.0 (or rather GTK2?) sets LANG = en_us.UTF-8 and apparently some 0124 # older apps (including wxGTK) can't handle this. The fix is to set LANG=C 0125 # before running the app. Thanks to Ahmad Baitalmal for supplying this info. 0126 import os 0127 os.putenv('LANG', 'C') 0128 0129 ##try: 0130 ## from wxPython import wx, grid 0131 ##except ImportError: 0132 ## wx = None 0133 0134 try: 0135 import wx 0136 import wx.grid as gridlib 0137 except ImportError: 0138 wx = None 0139 0140 class InvalidDelimiter(exceptions.StandardError): pass 0141 class InvalidTextQualifier(exceptions.StandardError): pass 0142 class InvalidData(exceptions.StandardError): pass 0143 class InvalidNumberOfColumns(exceptions.StandardError): pass 0144 0145 # ------------------------------------------------------------------------------ 0146 def guessTextQualifier(input): 0147 """ 0148 PROTOTYPE: 0149 guessTextQualifier(input) 0150 DESCRIPTION: 0151 tries to guess if the text qualifier (a character delimiting ambiguous data) 0152 is a single or double-quote (or None) 0153 ARGUMENTS: 0154 - input is raw data as a string 0155 RETURNS: 0156 single character or None 0157 """ 0158 0159 # Algorithm: looks for text enclosed between two identical quotes (the probable 0160 # qualifier) which are preceded and followed by the same character (the 0161 # probable delimiter), for example: 0162 # ,'some text', 0163 # The quote with the most wins. 0164 0165 data = input[:16 * 1024] # limit sample to 16k 0166 0167 regexp = re.compile('(?:(?:^|\n)(?P<b_quote>["\']).*?(?P=b_quote))|' 0168 '(?:(?P<delim>.)(?P<quote>["\']).*?(?P=quote)(?=(?P=delim)|\n))|' 0169 '(?:(?P<e_quote>["\']).*?(?P=e_quote)$)', re.M | re.S) 0170 matches = filter(lambda i: reduce(lambda a, b: a + b, i), regexp.findall(data)) 0171 if not matches: return None 0172 0173 quotes = {} 0174 for q in ('b_quote', 'quote', 'e_quote'): 0175 n = regexp.groupindex[q] - 1 0176 for m in matches: 0177 key = m[n] 0178 if key: 0179 quotes[key] = quotes.get(key, 0) + 1 0180 0181 return reduce(lambda a, b, quotes = quotes: 0182 (quotes[a] > quotes[b]) and a or b, quotes.keys()) 0183 0184 # ------------------------------------------------------------------------------ 0185 def guessDelimiter(input, textQualifier = '"'): 0186 """ 0187 PROTOTYPE: 0188 guessDelimiter(input, textQualifier = '\"') 0189 DESCRIPTION: 0190 Tries to guess the delimiter. 0191 ARGUMENTS: 0192 - input is raw data as string 0193 - textQualifier is a character used to delimit ambiguous data 0194 RETURNS: 0195 single character or None 0196 """ 0197 0198 # Algorithm: the delimiter /should/ occur the same number of times on each 0199 # row. However, due to malformed data, it may not. We don't want an all or 0200 # nothing approach, so we allow for small variations in the number. 0201 # 1) build a table of the frequency of each character on every line. 0202 # 2) build a table of freqencies of this frequency (meta-frequency?), e.g. 0203 # "x occurred 5 times in 10 rows, 6 times in 1000 rows, 7 times in 2 rows" 0204 # 3) use the mode of the meta-frequency to decide what the frequency /should/ 0205 # be for that character 0206 # 4) find out how often the character actually meets that goal 0207 # 5) the character that best meets its goal is the delimiter 0208 # For performance reasons, the data is evaluated in chunks, so it can try 0209 # and evaluate the smallest portion of the data possible, evaluating additional 0210 # chunks as necessary. 0211 0212 if type(input) != type([]): return None 0213 if len(input) < 2: return None 0214 0215 if textQualifier: 0216 # eliminate text inside textQualifiers 0217 regexp = re.compile('%s(.*?)%s' % (textQualifier, textQualifier), re.S) 0218 subCode = compile("regexp.sub('', line)", '', 'eval') 0219 else: 0220 subCode = compile("line", '', 'eval') 0221 0222 ascii = [chr(c) for c in range(127)] # 7-bit ASCII 0223 0224 # build frequency tables 0225 chunkLength = min(10, len(input)) 0226 iteration = 0 0227 charFrequency = {} 0228 modes = {} 0229 delims = {} 0230 start, end = 0, min(chunkLength, len(input)) 0231 while start < len(input): 0232 iteration += 1 0233 for line in input[start:end]: 0234 l = eval(subCode) 0235 for char in ascii: 0236 metafrequency = charFrequency.get(char, {}) 0237 freq = l.strip().count(char) # must count even if frequency is 0 0238 metafrequency[freq] = metafrequency.get(freq, 0) + 1 # value is the mode 0239 charFrequency[char] = metafrequency 0240 0241 for char in charFrequency.keys(): 0242 items = charFrequency[char].items() 0243 if len(items) == 1 and items[0][0] == 0: continue 0244 # get the mode of the frequencies 0245 if len(items) > 1: 0246 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b, items) 0247 # adjust the mode - subtract the sum of all other frequencies 0248 items.remove(modes[char]) 0249 modes[char] = (modes[char][0], modes[char][1] 0250 - reduce(lambda a, b: (0, a[1] + b[1]), items)[1]) 0251 else: 0252 modes[char] = items[0] 0253 0254 # build a list of possible delimiters 0255 modeList = modes.items() 0256 total = float(chunkLength * iteration) 0257 consistency = 1.0 # (rows of consistent data) / (number of rows) = 100% 0258 threshold = 0.9 # minimum consistency threshold 0259 while len(delims) == 0 and consistency >= threshold: 0260 for k, v in modeList: 0261 if v[0] > 0 and v[1] > 0: 0262 if (v[1]/total) >= consistency: 0263 delims[k] = v 0264 consistency -= 0.01 0265 0266 if len(delims) == 1: 0267 return delims.keys()[0] 0268 0269 # analyze another chunkLength lines 0270 start = end 0271 end += chunkLength 0272 0273 if not delims: return None 0274 0275 # if there's more than one candidate, look at quoted data for clues. 0276 # while any character may be quoted, any delimiter that occurs as a 0277 # part of the data /must/ be quoted. 0278 if len(delims) > 1 and textQualifier is not None: 0279 regexp = re.compile('%s(.*?)%s' % (textQualifier, textQualifier), re.S) 0280 for line in input: 0281 inQuotes = "".join(regexp.findall(line)) 0282 for d in delims.keys(): 0283 if not d in inQuotes: 0284 del delims[d] 0285 if len(delims) == 1: 0286 return delims.keys()[0] 0287 0288 # if there's *still* more than one, fall back to a 'preferred' list 0289 if len(delims) > 1: 0290 for d in ['\t', ',', ';', ' ', ':']: 0291 if d in delims.keys(): 0292 return d 0293 0294 # finally, just return the first damn character in the list 0295 return delims.keys()[0] 0296 0297 # ------------------------------------------------------------------------------ 0298 def modeOfLengths(input): 0299 """ 0300 PROTOTYPE: 0301 modeOfLengths(input) 0302 DESCRIPTION: 0303 Finds the mode (most frequently occurring value) of the lengths of the lines. 0304 ARGUMENTS: 0305 - input is list of lists of data 0306 RETURNS: 0307 mode as integer 0308 """ 0309 freq = {} 0310 for row in input: 0311 l = len(row) 0312 freq[l] = freq.get(l, 0) + 1 0313 0314 return reduce(lambda a, b, freq = freq: (freq[a] > freq[b]) and a or b, freq.keys()) 0315 0316 # ------------------------------------------------------------------------------ 0317 def guessHeaders(input, columns = 0): 0318 """ 0319 PROTOTYPE: 0320 guessHeaders(input, columns = 0) 0321 DESCRIPTION: 0322 Decides whether row 0 is a header row 0323 ARGUMENTS: 0324 - input is a list of lists of data (as returned by importDSV) 0325 - columns is either the expected number of columns in each row or 0 0326 RETURNS: 0327 - true if data has header row 0328 """ 0329 0330 # Algorithm: creates a dictionary of types of data in each column. If any column 0331 # is of a single type (say, integers), *except* for the first row, then the first 0332 # row is presumed to be labels. If the type can't be determined, it is assumed to 0333 # be a string in which case the length of the string is the determining factor: if 0334 # all of the rows except for the first are the same length, it's a header. 0335 # Finally, a 'vote' is taken at the end for each column, adding or subtracting from 0336 # the likelihood of the first row being a header. 0337 0338 if type(input) != type([]): raise InvalidData, "list expected." 0339 if len(input) < 2: return 0 0340 0341 if not columns: 0342 columns = modeOfLengths(input) 0343 0344 columnTypes = {} 0345 for i in range(columns): columnTypes[i] = None 0346 0347 for row in input[1:]: 0348 if len(row) != columns: 0349 continue # skip rows that have irregular number of columns 0350 for col in columnTypes.keys(): 0351 try: 0352 try: 0353 # is it a built-in type (besides string)? 0354 thisType = type(eval(row[col])) 0355 except OverflowError: 0356 # a long int? 0357 thisType = type(eval(row[col] + 'L')) 0358 thisType = type(0) # treat long ints as int 0359 except: 0360 # fallback to length of string 0361 thisType = len(row[col]) 0362 0363 if thisType != columnTypes[col]: 0364 if columnTypes[col] is None: # add new column type 0365 columnTypes[col] = thisType 0366 else: # type is inconsistent, remove column from consideration 0367 del columnTypes[col] 0368 0369 # finally, compare results against first row and vote on whether it's a header 0370 hasHeader = 0 0371 for col, colType in columnTypes.items(): 0372 if type(colType) == type(0): # it's a length 0373 if len(input[0][col]) != colType: 0374 hasHeader += 1 0375 else: 0376 hasHeader -= 1 0377 else: # attempt typecast 0378 try: 0379 eval("%s(%s)" % (colType.__name__, input[0][col])) 0380 except: 0381 hasHeader += 1 0382 else: 0383 hasHeader -= 1 0384 0385 return hasHeader > 0 0386 0387 # ------------------------------------------------------------------------------ 0388 def organizeIntoLines(input, textQualifier = '"', limit = None): 0389 """ 0390 PROTOTYPE: 0391 organizeIntoLines(input, textQualifier = '\"', limit = None) 0392 DESCRIPTION: 0393 Takes raw data (as from file.read()) and organizes it into lines. 0394 Newlines that occur within text qualifiers are treated as normal 0395 characters, not line delimiters. 0396 ARGUMENTS: 0397 - input is raw data as a string 0398 - textQualifier is a character used to delimit ambiguous data 0399 - limit is a integer specifying the maximum number of lines to organize 0400 RETURNS: 0401 list of strings 0402 """ 0403 0404 # Algorithm: there should be an even number of text qualifiers on every line. 0405 # If there isn't, that means that the newline at the end of the line must occur 0406 # within qualifiers and doesn't really indicate the end of a record. 0407 0408 data = input.split('\n') 0409 line = 0 0410 while 1: 0411 try: 0412 while data[line].count(textQualifier) % 2: # while odd number 0413 data[line] = data[line] + '\n' + data[line + 1] # add the next line 0414 del data[line + 1] # delete the next line 0415 line += 1 0416 if limit and line > limit: 0417 del data[limit:] # kill any lines that weren't processed 0418 break 0419 except: 0420 break 0421 0422 # filter out empty lines 0423 # data = filter(lambda i: "".join(i), data) 0424 data = filter(string.join, data) 0425 return data 0426 0427 # ------------------------------------------------------------------------------ 0428 # some common error handlers to pass to importDSV 0429 # others might do things like log errors to a file. 0430 # oldrow is the unparsed data, newrow is the parsed data 0431 def padRow(oldrow, newrow, columns, maxColumns): 0432 "pads all rows to the same length with empty strings" 0433 difference = maxColumns - len(newrow) 0434 return newrow + ([''] * difference) 0435 0436 def skipRow(oldrow, newrow, columns, maxColumns): 0437 "skips any inconsistent rows" 0438 return None 0439 0440 def useRow(oldrow, newrow, columns, maxColumns): 0441 "returns row unchanged" 0442 return newrow 0443 0444 # ------------------------------------------------------------------------------ 0445 def importDSV(input, delimiter = ',', textQualifier = '"', columns = 0, 0446 updateFunction = None, errorHandler = None): 0447 """ 0448 PROTOTYPE: 0449 importDSV(input, delimiter = ',', textQualifier = '\"', columns = 0, 0450 updateFunction = None, errorHandler = None) 0451 DESCRIPTION: 0452 parses lines of data in CSV format 0453 ARGUMENTS: 0454 - input is a list of strings (built by organizeIntoLines) 0455 - delimiter is the character used to delimit columns 0456 - textQualifier is the character used to delimit ambiguous data 0457 - columns is the expected number of columns in each row or 0 0458 - updateFunction is a callback function called once per record (could be 0459 used for updating progress bars). Its prototype is 0460 updateFunction(percentDone) 0461 - percentDone is an integer between 0 and 100 0462 - errorHandler is a callback invoked whenever a row has an unexpected number 0463 of columns. Its prototype is 0464 errorHandler(oldrow, newrow, columns, maxColumns) 0465 where 0466 - oldrow is the unparsed data 0467 - newrow is the parsed data 0468 - columns is the expected length of a row 0469 - maxColumns is the longest row in the data 0470 RETURNS: 0471 list of lists of data 0472 """ 0473 if type(input) != type([]): 0474 raise InvalidData, "expected list of lists of strings" 0475 if type(delimiter) != type('') or not delimiter: 0476 raise InvalidDelimiter, `delimiter` 0477 0478 ## if textQualifier: 0479 ## # fieldRex=re.compile('(?:(?:[,]|^)"(.*?)"(?=[,]|$))|(?:(?:[,]|^)([^",]*?)(?=[,]|$))') 0480 ## fieldRex = re.compile('(?:(?:[%s]|^)%s(.*?)%s(?=[%s]|$))|(?:(?:[%s]|^)([^%s%s]*?)(?=[%s]|$))' 0481 ## % (delimiter, textQualifier, textQualifier, delimiter, 0482 ## delimiter, textQualifier, delimiter, delimiter), 0483 ## re.S) 0484 ## else: 0485 ## fieldRex = re.compile('(?:[%s]|^)([^%s]*?)(?=[%s]|$)' 0486 ## % (delimiter, delimiter, delimiter), re.S) 0487 0488 percent = 0.0 0489 lineno = 0.0 0490 newdata = [] 0491 maxColumns = 0 0492 0493 ## for line in input: 0494 ## line = line.strip() 0495 ## record = fieldRex.findall(line) 0496 ## print record 0497 ## if textQualifier: 0498 ## record = [(i[0] or i[1]) for i in record] 0499 0500 ## if textQualifier: 0501 ## record = [c.replace(textQualifier * 2, textQualifier) for c in record] 0502 ## newdata.append(record) 0503 0504 # This code was submitted by Nigel to replace the code commented out above. 0505 # It addresses several issues with embedded quotes and delimiters. It seems that 0506 # while off to a good start, regular expressions won't be able to handle certain 0507 # situations. i.e. '''"Say ""hello"", World", ""''' would seem to be a problem as 0508 # an embedded delimiter follows an embedded quote which throws off the re search. 0509 0510 for line in input: 0511 if textQualifier: 0512 record = [] 0513 inquotes = 0 0514 for s in line.split(delimiter): 0515 odd = s.count(textQualifier) % 2 0516 if inquotes: 0517 accu += delimiter + s.replace(textQualifier * 2, delimiter).\ 0518 replace(textQualifier, '').replace(delimiter, textQualifier) 0519 if odd: 0520 record.append(accu) 0521 inquotes = 0 0522 else: 0523 # 1.3.6 bugfix: deal with case where s = "" to denote an empty string 0524 if s.count(textQualifier): # discard whitespace outside of textQualifiers when they are used 0525 s = s.strip() 0526 # fix new problem with "" 0527 if s == textQualifier * 2: 0528 s = "" 0529 0530 accu = s.replace(textQualifier * 2, delimiter).\ 0531 replace(textQualifier, '').replace(delimiter, textQualifier) 0532 if odd: 0533 inquotes = 1 0534 else: 0535 record.append(accu) 0536 else: 0537 #record = map(lambda x: x.strip(), line.split(delimiter)) 0538 record = map(string.strip, line.split(delimiter)) 0539 0540 newdata.append(record) 0541 # (end of replacement code) 0542 0543 if updateFunction is not None: 0544 lineno = lineno + 1.0 0545 newpercent = int((lineno / len(input)) * 100) 0546 if percent != newpercent: 0547 percent = newpercent 0548 if not updateFunction(percent): 0549 return None 0550 0551 if not columns: 0552 columns = modeOfLengths(newdata) 0553 maxColumns = max([len(line) for line in newdata]) 0554 0555 # consistency check 0556 for record in xrange(len(newdata)): 0557 length = len(newdata[record]) 0558 difference = length - columns 0559 if difference: 0560 if errorHandler is None: 0561 raise InvalidNumberOfColumns, "Expected %d, got %d" % (columns, length) 0562 else: 0563 newdata[record] = errorHandler(input[record], newdata[record], columns, maxColumns) 0564 0565 # remove null values from data 0566 # newdata = filter(lambda i: i, newdata) 0567 newdata = filter(None, newdata) 0568 0569 return newdata 0570 0571 0572 # ------------------------------------------------------------------------------ 0573 def exportDSV(input, delimiter = ',', textQualifier = '"', quoteall = 0): 0574 """ 0575 PROTOTYPE: 0576 exportDSV(input, delimiter = ',', textQualifier = '\"', quoteall = 0) 0577 DESCRIPTION: 0578 Exports to DSV (delimiter-separated values) format. 0579 ARGUMENTS: 0580 - input is list of lists of data (as returned by importDSV) 0581 - delimiter is character used to delimit columns 0582 - textQualifier is character used to delimit ambiguous data 0583 - quoteall is boolean specifying whether to quote all data or only data 0584 that requires it 0585 RETURNS: 0586 data as string 0587 """ 0588 if not delimiter or type(delimiter) != type(''): raise InvalidDelimiter 0589 if not textQualifier or type(delimiter) != type(''): raise InvalidTextQualifier 0590 0591 # double-up all text qualifiers in data (i.e. can't becomes can''t) 0592 data = map(lambda i, q = textQualifier: 0593 map(lambda j, q = q: str(j).replace(q, q * 2), i), 0594 input) 0595 0596 if quoteall: # quote every data value 0597 data = map(lambda i, q = textQualifier: 0598 map(lambda j, q = q: q + j + q, i), 0599 data) 0600 else: # quote only the values that contain qualifiers, delimiters or newlines 0601 data = map(lambda i, q = textQualifier, d = delimiter: 0602 map(lambda j, q = q, d = d: ((j.find(q) != -1 or j.find(d) != -1 0603 or j.find('\n') != -1) 0604 and (q + j + q)) or j, i), data) 0605 # assemble each line with delimiters 0606 data = [delimiter.join(line) for line in data] 0607 0608 # assemble all lines together, separated by newlines 0609 data = "\n".join(data) 0610 return data 0611 0612 if wx is not None: 0613 # ------------------------------------------------------------------------------ 0614 class ImportWizardPanel_Delimiters(wx.Panel): 0615 """ 0616 CLASS(SUPERCLASS): 0617 ImportWizardPanel_Delimiters(wx.Panel) 0618 DESCRIPTION: 0619 A wx.Panel that provides a basic interface for validating and changing the 0620 parameters for importing a delimited text file. Similar to MS Excel's 0621 CSV import wizard. Can be used in a series of wizards or embedded in an 0622 application. 0623 PROTOTYPE: 0624 ImportWizardPanel_Delimiters(parent, id, file, data, isValidCallback = None, 0625 pos = wx.DefaultPosition, size = wx.DefaultSize, 0626 style = wx.TAB_TRAVERSAL, name = 'ImportWizardPanel') 0627 ARGUMENTS: 0628 - parent is the parent window 0629 - id is the id of this wizard panel 0630 - file is the name of the file being imported 0631 - data is the raw data to be parsed 0632 - isValidCallback is a callback function that accepts a single boolean argument 0633 If the argument is true, the wizard is in a valid state (all the settings are 0634 acceptable), if the argument is false, trying to import will likely cause an 0635 exception. 0636 METHODS: 0637 - GetDelimiters() 0638 returns list of characters used as delimiters 0639 - GetTextQualifiers() 0640 returns character used as text qualifier or None 0641 - GetHasHeaders() 0642 returns true if first row is header 0643 """ 0644 0645 def __init__(self, parent, id, file, data, isValidCallback = None, 0646 pos = wx.DefaultPosition, size = wx.DefaultSize, 0647 style = wx.TAB_TRAVERSAL, name = "ImportWizardPanel"): 0648 wx.Panel.__init__(self, parent, id, pos, size, style, name) 0649 self.SetAutoLayout(True) 0650 mainSizer = wx.FlexGridSizer(3, 1) 0651 self.SetSizer(mainSizer) 0652 mainSizer.AddGrowableCol(0) 0653 0654 self.initialized = False 0655 self.data = data 0656 self.isValidCallback = isValidCallback 0657 self.Validate = (isValidCallback and self.Validate) or self.BuildPreview 0658 0659 dlg = wx.ProgressDialog("Import Wizard", 0660 "Analyzing %s... Please wait." % file, 0661 3, 0662 parent, 0663 wx.PD_APP_MODAL | wx.PD_AUTO_HIDE) 0664 textQualifier = guessTextQualifier(data) 0665 dlg.Update(1) 0666 newdata = organizeIntoLines(data, textQualifier = textQualifier, limit = 100) 0667 dlg.Update(2) 0668 delimiter = guessDelimiter(newdata, textQualifier = textQualifier) 0669 dlg.Update(3) 0670 dlg.Destroy() 0671 0672 # ------------- 0673 msg = ("This screen lets you set the delimiters your data contains.\n" 0674 "You can see how your data is affected in the preview below.") 0675 message1 = wx.StaticText(self, -1, msg) 0676 0677 # ------------- 0678 delimiterBox = wx.BoxSizer(wx.HORIZONTAL) 0679 delimStaticBox = wx.StaticBox(self, -1, "Delimiters") 0680 delimStaticSizer = wx.StaticBoxSizer(delimStaticBox, wx.VERTICAL) 0681 delimGridSizer = wx.FlexGridSizer(2, 3) 0682 0683 delims = { 0684 'Tab': '\t', 0685 'Semicolon': ';', 0686 'Comma': ',', 0687 'Space': ' ', 0688 } 0689 0690 self.delimChecks = {} 0691 0692 for label, value in delims.items(): 0693 self.delimChecks[value] = wx.CheckBox(self, -1, label) 0694 delimGridSizer.Add(self.delimChecks[value], 0, wx.ALL, 3) 0695 wx.EVT_CHECKBOX(self, self.delimChecks[value].GetId(), self.Validate) 0696 0697 otherSizer = wx.BoxSizer(wx.HORIZONTAL) 0698 0699 self.delimChecks['Other'] = wx.CheckBox(self, -1, 'Other:') 0700 wx.EVT_CHECKBOX(self, self.delimChecks['Other'].GetId(), self.Validate) 0701 0702 self.otherDelim = wx.TextCtrl(self, -1, size = (20, -1)) 0703 wx.EVT_TEXT(self, self.otherDelim.GetId(), self.OnCustomDelim) 0704 0705 if self.delimChecks.has_key(delimiter): 0706 self.delimChecks[delimiter].SetValue(True) 0707 elif delimiter is not None: 0708 self.delimChecks['Other'].SetValue(True) 0709 self.otherDelim.SetValue(delimiter) 0710 0711 otherSizer.AddMany([ 0712 (self.delimChecks['Other'], 0, wx.ALL, 3), 0713 (self.otherDelim, 0, wx.ALIGN_CENTER), 0714 ]) 0715 0716 delimGridSizer.Add(otherSizer) 0717 delimStaticSizer.Add(delimGridSizer, 1, wx.EXPAND) 0718 delimOtherSizer = wx.BoxSizer(wx.VERTICAL) 0719 self.consecutiveDelimsAs1 = wx.CheckBox(self, -1, "Treat consecutive delimiters as one") 0720 self.consecutiveDelimsAs1.Enable(False) 0721 tqSizer = wx.BoxSizer(wx.HORIZONTAL) 0722 self.textQualifierChoice = wx.Choice(self, -1, choices = ['"', "'", "{None}"]) 0723 wx.EVT_CHOICE(self, self.textQualifierChoice.GetId(), self.BuildPreview) 0724 if textQualifier is not None: 0725 self.textQualifierChoice.SetStringSelection(textQualifier) 0726 else: 0727 self.textQualifierChoice.SetStringSelection('{None}') 0728 0729 tqSizer.AddMany([ 0730 (wx.StaticText(self, -1, "Text qualifier:"), 0, wx.ALIGN_RIGHT | wx.ALIGN_CENTER_VERTICAL), 0731 (self.textQualifierChoice, 0, wx.ALL | wx.ALIGN_LEFT | wx.ALIGN_CENTER_VERTICAL, 5), 0732 ]) 0733 0734 delimOtherSizer.AddMany([ 0735 (self.consecutiveDelimsAs1, 1, wx.EXPAND | wx.ALL, 5), 0736 (tqSizer, 1, wx.ALL | wx.ALIGN_CENTER, 5), 0737 ]) 0738 0739 delimiterBox.AddMany([ 0740 (delimStaticSizer, 0, wx.ALIGN_CENTER), 0741 (delimOtherSizer, 0, wx.ALIGN_CENTER), 0742 ]) 0743 0744 delimStaticBox.Fit() 0745 0746 # ------------- 0747 self.displayRows = 6 0748 previewSettingsBox = wx.BoxSizer(wx.HORIZONTAL) 0749 self.hasHeaderRow = wx.CheckBox(self, -1, "First row is header") 0750 wx.EVT_CHECKBOX(self, self.hasHeaderRow.GetId(), self.BuildPreview) 0751 0752 if wx.Platform in ('__WX.WXGTK__', '__WX.WXMSW__'): 0753 # wx.SpinCtrl causes seg fault under GTK when <enter> is hit in text - use wx.SpinButton instead 0754 self.previewRowsText = wx.TextCtrl(self, -1, str(self.displayRows), 0755 size = (30, -1), style = wx.TE_PROCESS_ENTER) 0756 h = self.previewRowsText.GetSize().height 0757 self.previewRows = wx.SpinButton(self, -1, size = (-1, h), style = wx.SP_VERTICAL) 0758 self.previewRows.SetRange(self.displayRows, 100) 0759 self.previewRows.SetValue(self.displayRows) 0760 wx.EVT_SPIN(self, self.previewRows.GetId(), self.OnSpinPreviewRows) 0761 wx.EVT_TEXT_ENTER(self, self.previewRowsText.GetId(), self.OnTextPreviewRows) 0762 else: 0763 self.previewRows = wx.SpinCtrl(self, -1, str(self.displayRows), 0764 min = self.displayRows, max = 100, size = (50, -1)) 0765 wx.EVT_SPINCTRL(self, self.previewRows.GetId(), self.BuildPreview) 0766 0767 previewSettingsBox.AddMany([ 0768 (self.hasHeaderRow, 1, wx.ALL | wx.EXPAND, 5), 0769 (wx.StaticText(self, -1, "Preview"), 0, wx.WEST | wx.ALIGN_RIGHT | wx.ALIGN_CENTER_VERTICAL, 10), 0770 ]) 0771 if wx.Platform in ('__WX.WXGTK__', '__WX.WXMSW__'): 0772 previewSettingsBox.Add(self.previewRowsText, 0, wx.ALIGN_CENTER | wx.ALL, 3) 0773 previewSettingsBox.AddMany([ 0774 (self.previewRows, 0, wx.ALIGN_CENTER | wx.ALL, 3), 0775 (wx.StaticText(self, -1, "rows"), 0, wx.ALIGN_RIGHT | wx.ALIGN_CENTER_VERTICAL), 0776 ]) 0777 0778 # ------------- 0779 if delimiter is not None: 0780 previewData = importDSV(newdata[:self.displayRows], 0781 textQualifier = textQualifier, 0782 delimiter = delimiter, 0783 errorHandler = padRow) 0784 hasHeaders = guessHeaders(previewData) 0785 self.hasHeaderRow.SetValue(hasHeaders) 0786 0787 cols = len(previewData[0]) 0788 else: 0789 previewData = [] 0790 hasHeaders = 0 0791 cols = 1 0792 0793 previewStaticBox = wx.StaticBox(self, -1, "Data Preview") 0794 previewStaticSizer = wx.StaticBoxSizer(previewStaticBox, wx.VERTICAL) 0795 self.preview = gridlib.Grid(self, -1) 0796 self.preview.CreateGrid(self.displayRows, cols) 0797 self.preview.SetDefaultRowSize(self.preview.GetCharHeight() + 4, True) 0798 self.preview.EnableEditing(False) 0799 self.preview.SetColLabelSize(0) 0800 self.preview.SetRowLabelSize(0) 0801 self.preview.SetMargins(1, 0) 0802 self.initialized = True 0803 self.BuildPreview() 0804 0805 rowheight = self.preview.GetRowSize(0) + 2 0806 self.preview.SetSize((-1, rowheight * self.displayRows)) 0807 previewStaticSizer.Add(self.preview, 0, wx.ALL | wx.EXPAND, 5) 0808 0809 # ------------- 0810 mainSizer.AddMany([ 0811 (message1, 0, wx.ALL, 5), 0812 (delimiterBox, 0, wx.ALL, 5), 0813 (previewSettingsBox, 0, wx.ALL, 5), 0814 (previewStaticSizer, 0, wx.ALL | wx.EXPAND, 5), 0815 ]) 0816 0817 self.Layout() 0818 self.Fit() 0819 0820 def OnSpinPreviewRows(self, event): 0821 self.previewRowsText.SetValue(str(event.GetPosition())) 0822 self.BuildPreview() 0823 0824 def OnTextPreviewRows(self, event): 0825 try: v = int(self.previewRowsText.GetValue()) 0826 except: v = self.displayRows 0827 v = max(self.displayRows, v) 0828 v = min(v, 100) 0829 self.previewRowsText.SetValue(str(v)) 0830 self.previewRows.SetValue(v) 0831 self.BuildPreview() 0832 0833 def Validate(self, event = None): 0834 hasDelimiter = reduce(lambda a, b: a + b, [cb.GetValue() for cb in self.delimChecks.values()]) 0835 if hasDelimiter == 1 and self.delimChecks['Other'].GetValue(): 0836 hasDelimiter = self.otherDelim.GetValue() != "" 0837 self.BuildPreview() 0838 self.isValidCallback(hasDelimiter) 0839 0840 def BuildPreview(self, event = None): 0841 if not self.initialized: 0842 return # got triggered before initialization was completed 0843 0844 if wx.Platform != '__WX.WXGTK__': 0845 wx.BeginBusyCursor() # causes a spurious spin event under GTK 0846 wx.Yield() # allow controls to update first, in case of slow preview 0847 self.preview.BeginBatch() 0848 self.preview.DeleteCols(0, self.preview.GetNumberCols()) 0849 self.preview.DeleteRows(0, self.preview.GetNumberRows()) 0850 self.preview.ClearGrid() 0851 0852 textQualifier = self.textQualifierChoice.GetStringSelection() 0853 if textQualifier == '{None}': textQualifier = None 0854 other = self.otherDelim.GetValue() 0855 delimiter = map(lambda i, other = other: i[0] != 'Other' and i[0] or other, 0856 filter(lambda i: i[1].GetValue(), self.delimChecks.items())) 0857 delimiter = "".join(delimiter) 0858 0859 rows = self.previewRows.GetValue() 0860 0861 newdata = organizeIntoLines(self.data, textQualifier, limit = rows) 0862 try: 0863 previewData = importDSV(newdata[:rows], 0864 textQualifier = textQualifier, 0865 delimiter = delimiter, 0866 errorHandler = padRow) 0867 except InvalidDelimiter, e: 0868 previewData = map(lambda i: [i], newdata[:rows]) 0869 0870 rows = min(rows, len(previewData)) 0871 hasHeaders = self.hasHeaderRow.GetValue() 0872 self.preview.AppendRows(rows - hasHeaders) 0873 cols = max([len(row) for row in previewData]) 0874 self.preview.AppendCols(cols) 0875 0876 if hasHeaders: 0877 self.preview.SetColLabelSize(self.preview.GetRowSize(0)) 0878 for col in range(cols): 0879 try: self.preview.SetColLabelValue(col, str(previewData[0][col])) 0880 except: self.preview.SetColLabelValue(col, "") 0881 # self.preview.AutoSizeColumns(True) # size columns to headers 0882 else: 0883 self.preview.SetColLabelSize(0) 0884 0885 for row in range(hasHeaders, rows): 0886 for col in range(cols): 0887 try: self.preview.SetCellValue(row - hasHeaders, col, str(previewData[row][col])) 0888 except: pass 0889 0890 # if not hasHeaders: 0891 self.preview.AutoSizeColumns(True) # size columns to data 0892 0893 rowheight = self.preview.GetRowSize(0) 0894 self.preview.SetRowSize(0, rowheight) 0895 self.preview.EndBatch() 0896 if wx.Platform != '__WX.WXGTK__': 0897 wx.EndBusyCursor() 0898 0899 self.delimiters = delimiter 0900 self.textQualifier = textQualifier 0901 self.hasHeaders = hasHeaders 0902 0903 def OnCustomDelim(self, event = None): 0904 self.delimChecks['Other'].SetValue(len(self.otherDelim.GetValue())) 0905 self.Validate() 0906 0907 def GetDelimiters(self): 0908 return self.delimiters 0909 0910 def GetTextQualifier(self): 0911 return self.textQualifier 0912 0913 def GetHasHeaders(self): 0914 return self.hasHeaders 0915 0916 # ------------------------------------------------------------------------------ 0917 class ImportWizardDialog(wx.Dialog): 0918 """ 0919 CLASS(SUPERCLASS): 0920 ImportWizardDialog(wx.Dialog) 0921 DESCRIPTION: 0922 A dialog allowing the user to preview and change the options for importing 0923 a file. 0924 PROTOTYPE: 0925 ImportWizardDialog(parent, id, title, file, 0926 pos = wx.DefaultPosition, size = wx.DefaultSize, 0927 style = wx.DEFAULT_DIALOG_STYLE, name = 'ImportWizardDialog') 0928 ARGUMENTS: 0929 - parent: the parent window 0930 - id: the id of this window 0931 - title: the title of this dialog 0932 - file: the file to import 0933 METHODS: 0934 - GetImportInfo() 0935 returns a tuple (delimiters, text qualifiers, has headers) 0936 - ImportData(errorHandler = skipRow) 0937 returns (headers, data), headers may be None 0938 errorHandler is a callback function that instructs the method on what 0939 to do with irregular rows. The default skipRow function simply discards 0940 the bad row (see importDSV() above). 0941 """ 0942 0943 def __init__(self, parent, id, title, file, 0944 pos = wx.DefaultPosition, size = wx.DefaultSize, 0945 style = wx.DEFAULT_DIALOG_STYLE, name = "ImportWizardDialog"): 0946 wx.Dialog.__init__(self, parent, id, title, pos, size, style, name) 0947 self.SetAutoLayout(True) 0948 0949 self.file = file 0950 f = open(file, 'r') 0951 self.data = f.read() 0952 f.close() 0953 0954 sizer = wx.BoxSizer(wx.VERTICAL) 0955 self.delimPanel = ImportWizardPanel_Delimiters(self, -1, file, self.data, self.ValidState) 0956 buttonBox = self.ButtonBox() 0957 sizer.AddMany([ 0958 (self.delimPanel, 0, wx.ALL, 5), 0959 (buttonBox, 0, wx.SOUTH | wx.ALIGN_CENTER_HORIZONTAL | wx.ALIGN_TOP, 0), 0960 ]) 0961 0962 self.SetSizer(sizer) 0963 self.Layout() 0964 sizer.Fit(self.delimPanel) 0965 self.Fit() 0966 self.Centre() 0967 0968 def ButtonBox(self): 0969 panel = wx.Panel(self, -1) 0970 panel.SetAutoLayout(True) 0971 sizer = wx.BoxSizer(wx.HORIZONTAL) 0972 panel.SetSizer(sizer) 0973 self.ok = wx.Button(panel, wx.ID_OK, "Ok") 0974 cancel = wx.Button(panel, wx.ID_CANCEL, "Cancel") 0975 sizer.AddMany([ 0976 (self.ok, 0, wx.ALIGN_TOP | wx.EAST | wx.SOUTH, 10), 0977 (cancel, 0, wx.ALIGN_TOP | wx.WEST | wx.SOUTH, 10), 0978 ]) 0979 panel.Layout() 0980 panel.Fit() 0981 return panel 0982 0983 def GetImportInfo(self): 0984 return (self.delimPanel.GetDelimiters(), 0985 self.delimPanel.GetTextQualifier(), 0986 self.delimPanel.GetHasHeaders()) 0987 0988 def ImportData(self, errorHandler = skipRow): 0989 delimiters, qualifier, hasHeaders = self.GetImportInfo() 0990 self.data = organizeIntoLines(self.data, textQualifier = qualifier) 0991 dlg = wx.ProgressDialog("Import DSV File", 0992 self.file, 0993 100, 0994 self, 0995 wx.PD_CAN_ABORT | wx.PD_APP_MODAL | wx.PD_AUTO_HIDE) 0996 self.data = importDSV(self.data, 0997 delimiter = delimiters, 0998 textQualifier = qualifier, 0999 updateFunction = dlg.Update, 1000 errorHandler = errorHandler) 1001 if self.data is None: return None 1002 if hasHeaders: 1003 headers = copy.copy(self.data[0]) 1004 del self.data[0] 1005 else: 1006 headers = None 1007 return (headers, self.data) 1008 1009 def ValidState(self, isValid): 1010 self.ok.Enable(isValid) 1011 1012 1013 # ------------------------------------------------------------------------------ 1014 if __name__ == '__main__': 1015 if wx is None: 1016 print "\nYou need wxPython to run this sample*." 1017 print "\n*Note that wxPython is _not_ necessary to use this module, but it is required" 1018 print "to use the wizard dialog (which the sample requires)." 1019 raise SystemExit 1020 1021 1022 def demo(): 1023 class SampleApp(wx.App): 1024 def OnInit(self): 1025 dlg = wx.FileDialog(None, "Choose a file", ".", "", 1026 "CSV files (*.csv)|*.csv|Text files (*.txt)|*.txt|All files (*.*)|*.*", 1027 wx.OPEN) 1028 if dlg.ShowModal() == wx.ID_OK: 1029 path = dlg.GetPath() 1030 dlg.Destroy() 1031 1032 errorLog = open('import_error.log', 'a+') 1033 def logErrors(oldrow, newrow, expectedColumns, maxColumns, file = errorLog): 1034 # log the bad row to a file 1035 file.write(oldrow + '\n') 1036 1037 dlg = ImportWizardDialog(None, -1, 'CSV Import Wizard (v.%s)' % __version__, path) 1038 if dlg.ShowModal() == wx.ID_OK: 1039 results = dlg.ImportData(errorHandler = logErrors) 1040 dlg.Destroy() 1041 errorLog.close() 1042 1043 if results is not None: 1044 headers, data = results 1045 if True: # print the output to stdout 1046 if headers: 1047 print headers 1048 print 80*'=' 1049 for row in data: 1050 print row 1051 1052 if True: # for testing export functionality 1053 if headers: 1054 print exportDSV([headers] + data) 1055 else: 1056 print exportDSV(data) 1057 else: 1058 dlg.Destroy() 1059 1060 else: 1061 dlg.Destroy() 1062 1063 return True 1064 1065 app = SampleApp() 1066 app.MainLoop() 1067 1068 # import profile 1069 # profile.run('demo()') 1070 demo() 1071
Generated by PyXR 0.9.4