PyXR

c:\projects\bitpim\src \ thirdparty \ DSV.py


0001 """
0002 DSV.py - Cliff Wells, 2002
0003   Import/export DSV (delimiter separated values, a generalization of CSV).
0004 
0005 $Id: DSV.py 3878 2007-01-09 22:28:37Z djpham $
0006 Modified by Joe Pham <djpham@bitpim.org> to accommodate wxPython 2.8+
0007 
0008 Basic use:
0009 
0010    from DSV import DSV
0011 
0012    data = file.read()
0013    qualifier = DSV.guessTextQualifier(data) # optional
0014    data = DSV.organizeIntoLines(data, textQualifier = qualifier)
0015    delimiter = DSV.guessDelimiter(data) # optional
0016    data = DSV.importDSV(data, delimiter = delimiter, textQualifier = qualifier)
0017    hasHeader = DSV.guessHeaders(data) # optional
0018 
0019 If you know the delimiters, qualifiers, etc, you may skip the optional
0020 'guessing' steps as they rely on heuristics anyway (although they seem
0021 to work well, there is no guarantee they are correct). What they are
0022 best used for is to make a good guess regarding the data structure and then
0023 let the user confirm it.
0024 
0025 As such there is a 'wizard' to aid in this process (use this in lieu of
0026 the above code - requires wxPython):
0027 
0028    from DSV import DSV
0029 
0030    dlg = DSV.ImportWizardDialog(parent, -1, 'DSV Import Wizard', filename)
0031    dlg.ShowModal()
0032    headers, data = dlg.ImportData() # may also return None
0033    dlg.Destroy()
0034 
0035 The dlg.ImportData() method may also take a function as an optional argument
0036 specifying what it should do about malformed rows.  See the example at the bottom
0037 of this file. A few common functions are provided in this file (padRow, skipRow,
0038 useRow).
0039 
0040 Requires Python 2.0 or later
0041 Wizards tested with wxPython 2.2.5/NT 4.0, 2.3.2/Win2000 and Linux/GTK (RedHat 7.x)
0042 """
0043 
0044 __version__ = '1.4'
0045 
0046 """
0047 Bugs/Caveats:
0048    - Although I've tested this stuff on varied data, I'm sure there are cases
0049      that I haven't seen that will choke any one of these routines (or at least
0050      return invalid data). This is beta code!
0051    - guessTextQualifier() algorithm is limited to quotes (double or single).
0052    - Surprising feature: Hitting <enter> on wxSpinCtrl causes seg
0053      fault under Linux/GTK (not Win32). Strangely, pressing <tab> seems ok.
0054      Therefore, I had to use wxSpinButton.  Also, spurious spin events get
0055      generated for both of these controls (e.g. when calling wxBeginBusyCursor)
0056    - Keyboard navigation needs to be implemented on wizards
0057    - There may be issues with cr/lf translation, although I haven't yet seen any.
0058    
0059 Why another CSV tool?:
0060    - Because I needed a more flexible CSV importer, one that could accept different
0061      delimiters (not just commas or tabs), one that could make an intelligent guess
0062      regarding file structure (for user convenience), be compatible with the files
0063      output by MS Excel, and finally, be easily integrated with a wizard.  All of the
0064      modules I have seen prior to this fell short on one count or another.
0065    - It seemed interesting.
0066      
0067 To do:
0068    - Better guessTextQualifier() algorithm. In the perfect world I envision, I can
0069      use any character as a text qualifier, not just quotes.
0070    - Finish wizards and move them into separate module.
0071    - Better guessHeaders() algorithm, although this is difficult.
0072    - Optimize maps() - try to eliminate lambda when possible
0073    - Optimize memory usage.  Presently the entire file is loaded and then saved as
0074      a list.  A better approach might be to analyze a smaller part of the file and
0075      then return an iterator to step through it.
0076 """
0077 
0078 # Changelog
0079 # 1.4:
0080 #   - Fixed small bug in demo (forgotten Destroy()) that caused a hang
0081 #     when Cancel was pressed.
0082 #   - Removed extraneous guessHeaders() call in wizard.  I can only say,
0083 #     "what the??" and remember to profile.  This was a huge time waster.
0084 #
0085 # 1.3.9
0086 #   - Fixed real problem on Win32 in that wxProgressDialog must reach max
0087 #     value in order to close.
0088 #
0089 # 1.3.8
0090 #   - Change demo to use wxApp rather than wxPySimpleApp as it seemed
0091 #     to have problems on Win32 (per Kevin Altis)
0092 #
0093 # 1.37
0094 #   - Fix for font issue under GTK2 (thanks to Ahmad Baitalmal)
0095 #   - Added some space below the Ok/Cancel buttons.
0096 #
0097 # 1.36
0098 #   - Bugfix submitted by "nobody" ;) on SF
0099 #
0100 # 1.3.4 to 1.3.5:
0101 #   - Nigel Hathaway finds yet another bug (or two).  Can't seem to make him
0102 #     use something else, so they had to be fixed.  It's especially difficult
0103 #     to ignore him since he provided the fix.  Very annoying.
0104 #     - Problem with odd quote/delimiter combinations (SF bug #620284)
0105 #     - Losing empty fields at beginning/end (#619771)
0106 #     - Whitespace stripped from around string (#620115)
0107 #
0108 # 1.3.3 to 1.3.4(a):
0109 #   - Fixed bug in exportDSV that failed to quote data containing delimiter
0110 #     thanks to nhathaway@users.sourceforge.net
0111 #
0112 # 1.3 to 1.3.1:
0113 #   - Test for presence of wxPython (since it's not required except for wizard)
0114 #   - Changed "from wxPython.wx import *" to "from wxPython import wx"
0115 #   - Changed sample csv file (darkwave.csv) to demonstrate embedded quotes
0116 
0117 import sys
0118 # import pre as re # sre was broken, appears okay now. Try this if there are problems.
0119 import re 
0120 import copy
0121 import exceptions
0122 import string
0123 # RedHat 8.0 (or rather GTK2?) sets LANG = en_us.UTF-8 and apparently some
0124 # older apps (including wxGTK) can't handle this.  The fix is to set LANG=C
0125 # before running the app.  Thanks to Ahmad Baitalmal for supplying this info.
0126 import os
0127 os.putenv('LANG', 'C')
0128 
0129 ##try:
0130 ##    from wxPython import wx, grid
0131 ##except ImportError:
0132 ##    wx = None
0133 
0134 try:
0135     import wx
0136     import wx.grid as gridlib
0137 except ImportError:
0138     wx = None
0139 
0140 class InvalidDelimiter(exceptions.StandardError): pass
0141 class InvalidTextQualifier(exceptions.StandardError): pass
0142 class InvalidData(exceptions.StandardError): pass
0143 class InvalidNumberOfColumns(exceptions.StandardError): pass
0144 
0145 # ------------------------------------------------------------------------------
0146 def guessTextQualifier(input):
0147     """
0148     PROTOTYPE:
0149       guessTextQualifier(input)
0150     DESCRIPTION:
0151       tries to guess if the text qualifier (a character delimiting ambiguous data)
0152       is a single or double-quote (or None)
0153     ARGUMENTS:
0154       - input is raw data as a string
0155     RETURNS:
0156       single character or None
0157     """
0158     
0159     # Algorithm: looks for text enclosed between two identical quotes (the probable
0160     # qualifier) which are preceded and followed by the same character (the
0161     # probable delimiter), for example:
0162     #                        ,'some text',
0163     # The quote with the most wins.
0164     
0165     data = input[:16 * 1024] # limit sample to 16k
0166     
0167     regexp = re.compile('(?:(?:^|\n)(?P<b_quote>["\']).*?(?P=b_quote))|'
0168                         '(?:(?P<delim>.)(?P<quote>["\']).*?(?P=quote)(?=(?P=delim)|\n))|'
0169                         '(?:(?P<e_quote>["\']).*?(?P=e_quote)$)', re.M | re.S)
0170     matches = filter(lambda i: reduce(lambda a, b: a + b, i), regexp.findall(data))
0171     if not matches: return None
0172     
0173     quotes = {}
0174     for q in ('b_quote', 'quote', 'e_quote'):
0175         n = regexp.groupindex[q] - 1
0176         for m in matches:
0177             key = m[n]
0178             if key:
0179                 quotes[key] = quotes.get(key, 0) + 1
0180 
0181     return reduce(lambda a, b, quotes = quotes:
0182                   (quotes[a] > quotes[b]) and a or b, quotes.keys())
0183     
0184 # ------------------------------------------------------------------------------
0185 def guessDelimiter(input, textQualifier = '"'):
0186     """
0187     PROTOTYPE:
0188       guessDelimiter(input, textQualifier = '\"')
0189     DESCRIPTION:
0190       Tries to guess the delimiter.
0191     ARGUMENTS:
0192       - input is raw data as string
0193       - textQualifier is a character used to delimit ambiguous data
0194     RETURNS:
0195       single character or None
0196     """
0197     
0198     # Algorithm: the delimiter /should/ occur the same number of times on each
0199     # row. However, due to malformed data, it may not. We don't want an all or
0200     # nothing approach, so we allow for small variations in the number.
0201     # 1) build a table of the frequency of each character on every line.
0202     # 2) build a table of freqencies of this frequency (meta-frequency?), e.g.
0203     #     "x occurred 5 times in 10 rows, 6 times in 1000 rows, 7 times in 2 rows"
0204     # 3) use the mode of the meta-frequency to decide what the frequency /should/
0205     #    be for that character 
0206     # 4) find out how often the character actually meets that goal 
0207     # 5) the character that best meets its goal is the delimiter 
0208     # For performance reasons, the data is evaluated in chunks, so it can try
0209     # and evaluate the smallest portion of the data possible, evaluating additional
0210     # chunks as necessary. 
0211 
0212     if type(input) != type([]): return None
0213     if len(input) < 2: return None
0214 
0215     if textQualifier:
0216         # eliminate text inside textQualifiers
0217         regexp = re.compile('%s(.*?)%s' % (textQualifier, textQualifier), re.S)
0218         subCode = compile("regexp.sub('', line)", '', 'eval')
0219     else:
0220         subCode = compile("line", '', 'eval')
0221 
0222     ascii = [chr(c) for c in range(127)] # 7-bit ASCII
0223 
0224     # build frequency tables
0225     chunkLength = min(10, len(input))
0226     iteration = 0
0227     charFrequency = {}
0228     modes = {}
0229     delims = {}
0230     start, end = 0, min(chunkLength, len(input))
0231     while start < len(input):
0232         iteration += 1
0233         for line in input[start:end]:
0234             l = eval(subCode)
0235             for char in ascii:
0236                 metafrequency = charFrequency.get(char, {})
0237                 freq = l.strip().count(char) # must count even if frequency is 0
0238                 metafrequency[freq] = metafrequency.get(freq, 0) + 1 # value is the mode
0239                 charFrequency[char] = metafrequency
0240 
0241         for char in charFrequency.keys():
0242             items = charFrequency[char].items()
0243             if len(items) == 1 and items[0][0] == 0: continue
0244             # get the mode of the frequencies
0245             if len(items) > 1:
0246                 modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b, items)
0247                 # adjust the mode - subtract the sum of all other frequencies
0248                 items.remove(modes[char])
0249                 modes[char] = (modes[char][0], modes[char][1]
0250                                - reduce(lambda a, b: (0, a[1] + b[1]), items)[1])
0251             else:
0252                 modes[char] = items[0]
0253 
0254         # build a list of possible delimiters
0255         modeList = modes.items()
0256         total = float(chunkLength * iteration)
0257         consistency = 1.0 # (rows of consistent data) / (number of rows) = 100%
0258         threshold = 0.9  # minimum consistency threshold
0259         while len(delims) == 0 and consistency >= threshold:
0260             for k, v in modeList:
0261                 if v[0] > 0 and v[1] > 0:
0262                     if (v[1]/total) >= consistency:
0263                         delims[k] = v
0264             consistency -= 0.01
0265 
0266         if len(delims) == 1:
0267             return delims.keys()[0]
0268 
0269         # analyze another chunkLength lines
0270         start = end
0271         end += chunkLength
0272 
0273     if not delims: return None
0274     
0275     # if there's more than one candidate, look at quoted data for clues.
0276     # while any character may be quoted, any delimiter that occurs as a
0277     # part of the data /must/ be quoted.
0278     if len(delims) > 1 and textQualifier is not None:
0279         regexp = re.compile('%s(.*?)%s' % (textQualifier, textQualifier), re.S)
0280         for line in input:
0281             inQuotes = "".join(regexp.findall(line))
0282             for d in delims.keys():
0283                 if not d in inQuotes:
0284                     del delims[d]
0285                 if len(delims) == 1:
0286                     return delims.keys()[0]
0287     
0288     # if there's *still* more than one, fall back to a 'preferred' list
0289     if len(delims) > 1:
0290         for d in ['\t', ',', ';', ' ', ':']:
0291             if d in delims.keys():
0292                 return d
0293             
0294     # finally, just return the first damn character in the list
0295     return delims.keys()[0]
0296 
0297 # ------------------------------------------------------------------------------
0298 def modeOfLengths(input):
0299     """
0300     PROTOTYPE:
0301       modeOfLengths(input)
0302     DESCRIPTION:
0303       Finds the mode (most frequently occurring value) of the lengths of the lines.
0304     ARGUMENTS:
0305       - input is list of lists of data
0306     RETURNS:
0307       mode as integer
0308     """
0309     freq = {}
0310     for row in input:
0311         l = len(row)
0312         freq[l] = freq.get(l, 0) + 1
0313 
0314     return reduce(lambda a, b, freq = freq: (freq[a] > freq[b]) and a or b, freq.keys())
0315 
0316 # ------------------------------------------------------------------------------
0317 def guessHeaders(input, columns = 0):
0318     """
0319     PROTOTYPE:
0320       guessHeaders(input, columns = 0)
0321     DESCRIPTION:
0322       Decides whether row 0 is a header row
0323     ARGUMENTS:
0324       - input is a list of lists of data (as returned by importDSV)
0325       - columns is either the expected number of columns in each row or 0
0326     RETURNS:
0327       - true if data has header row
0328     """
0329     
0330     # Algorithm: creates a dictionary of types of data in each column. If any column
0331     # is of a single type (say, integers), *except* for the first row, then the first
0332     # row is presumed to be labels. If the type can't be determined, it is assumed to
0333     # be a string in which case the length of the string is the determining factor: if
0334     # all of the rows except for the first are the same length, it's a header.
0335     # Finally, a 'vote' is taken at the end for each column, adding or subtracting from
0336     # the likelihood of the first row being a header. 
0337 
0338     if type(input) != type([]): raise InvalidData, "list expected."
0339     if len(input) < 2: return 0
0340 
0341     if not columns:
0342         columns = modeOfLengths(input)
0343         
0344     columnTypes = {}
0345     for i in range(columns): columnTypes[i] = None
0346     
0347     for row in input[1:]:
0348         if len(row) != columns:
0349             continue # skip rows that have irregular number of columns
0350         for col in columnTypes.keys():
0351             try:
0352                 try:
0353                     # is it a built-in type (besides string)?
0354                     thisType = type(eval(row[col]))
0355                 except OverflowError:
0356                     # a long int?
0357                     thisType = type(eval(row[col] + 'L'))
0358                     thisType = type(0) # treat long ints as int
0359             except:
0360                 # fallback to length of string
0361                 thisType = len(row[col])
0362 
0363             if thisType != columnTypes[col]:
0364                 if columnTypes[col] is None: # add new column type
0365                     columnTypes[col] = thisType
0366                 else: # type is inconsistent, remove column from consideration
0367                     del columnTypes[col]
0368                     
0369     # finally, compare results against first row and vote on whether it's a header
0370     hasHeader = 0
0371     for col, colType in columnTypes.items():
0372         if type(colType) == type(0): # it's a length
0373             if len(input[0][col]) != colType:
0374                 hasHeader += 1
0375             else:
0376                 hasHeader -= 1
0377         else: # attempt typecast
0378             try:
0379                 eval("%s(%s)" % (colType.__name__, input[0][col]))
0380             except:
0381                 hasHeader += 1
0382             else:
0383                 hasHeader -= 1
0384 
0385     return hasHeader > 0
0386 
0387 # ------------------------------------------------------------------------------
0388 def organizeIntoLines(input, textQualifier = '"', limit = None):
0389     """
0390     PROTOTYPE:
0391       organizeIntoLines(input, textQualifier = '\"', limit = None)
0392     DESCRIPTION:
0393       Takes raw data (as from file.read()) and organizes it into lines.
0394       Newlines that occur within text qualifiers are treated as normal
0395       characters, not line delimiters.
0396     ARGUMENTS:
0397       - input is raw data as a string
0398       - textQualifier is a character used to delimit ambiguous data
0399       - limit is a integer specifying the maximum number of lines to organize
0400     RETURNS:
0401       list of strings
0402     """
0403     
0404     # Algorithm: there should be an even number of text qualifiers on every line.
0405     # If there isn't, that means that the newline at the end of the line must occur
0406     # within qualifiers and doesn't really indicate the end of a record.
0407     
0408     data = input.split('\n')
0409     line = 0
0410     while 1:
0411         try:
0412             while data[line].count(textQualifier) % 2: # while odd number
0413                 data[line] = data[line] + '\n' + data[line + 1] # add the next line
0414                 del data[line + 1] # delete the next line
0415             line += 1
0416             if limit and line > limit:
0417                 del data[limit:] # kill any lines that weren't processed
0418                 break
0419         except:
0420             break
0421         
0422     # filter out empty lines
0423     # data = filter(lambda i: "".join(i), data)
0424     data = filter(string.join, data)
0425     return data
0426 
0427 # ------------------------------------------------------------------------------
0428 # some common error handlers to pass to importDSV
0429 # others might do things like log errors to a file.
0430 # oldrow is the unparsed data, newrow is the parsed data
0431 def padRow(oldrow, newrow, columns, maxColumns):
0432     "pads all rows to the same length with empty strings"
0433     difference = maxColumns - len(newrow)
0434     return newrow + ([''] * difference)
0435 
0436 def skipRow(oldrow, newrow, columns, maxColumns):
0437     "skips any inconsistent rows"
0438     return None
0439 
0440 def useRow(oldrow, newrow, columns, maxColumns):
0441     "returns row unchanged"
0442     return newrow
0443 
0444 # ------------------------------------------------------------------------------
0445 def importDSV(input, delimiter = ',', textQualifier = '"', columns = 0,
0446               updateFunction = None, errorHandler = None):
0447     """
0448     PROTOTYPE:
0449       importDSV(input, delimiter = ',', textQualifier = '\"', columns = 0,
0450                 updateFunction = None, errorHandler = None)
0451     DESCRIPTION:
0452       parses lines of data in CSV format
0453     ARGUMENTS:
0454       - input is a list of strings (built by organizeIntoLines)
0455       - delimiter is the character used to delimit columns
0456       - textQualifier is the character used to delimit ambiguous data
0457       - columns is the expected number of columns in each row or 0
0458       - updateFunction is a callback function called once per record (could be
0459         used for updating progress bars). Its prototype is
0460            updateFunction(percentDone)
0461            - percentDone is an integer between 0 and 100
0462       - errorHandler is a callback invoked whenever a row has an unexpected number
0463         of columns. Its prototype is
0464            errorHandler(oldrow, newrow, columns, maxColumns)
0465               where
0466               - oldrow is the unparsed data
0467               - newrow is the parsed data
0468               - columns is the expected length of a row
0469               - maxColumns is the longest row in the data
0470     RETURNS:
0471       list of lists of data
0472     """
0473     if type(input) != type([]):
0474         raise InvalidData, "expected list of lists of strings"  
0475     if type(delimiter) != type('') or not delimiter:
0476         raise InvalidDelimiter, `delimiter`
0477 
0478 ##    if textQualifier:
0479 ##        # fieldRex=re.compile('(?:(?:[,]|^)"(.*?)"(?=[,]|$))|(?:(?:[,]|^)([^",]*?)(?=[,]|$))')
0480 ##        fieldRex = re.compile('(?:(?:[%s]|^)%s(.*?)%s(?=[%s]|$))|(?:(?:[%s]|^)([^%s%s]*?)(?=[%s]|$))'
0481 ##                              % (delimiter, textQualifier, textQualifier, delimiter,
0482 ##                                 delimiter, textQualifier, delimiter, delimiter),
0483 ##                              re.S)
0484 ##    else:
0485 ##        fieldRex = re.compile('(?:[%s]|^)([^%s]*?)(?=[%s]|$)'
0486 ##                              % (delimiter, delimiter, delimiter), re.S)
0487 
0488     percent = 0.0
0489     lineno = 0.0
0490     newdata = []
0491     maxColumns = 0
0492 
0493 ##    for line in input:
0494 ##        line = line.strip()
0495 ##        record = fieldRex.findall(line)
0496 ##        print record
0497 ##        if textQualifier:
0498 ##            record = [(i[0] or i[1]) for i in record]
0499 
0500 ##        if textQualifier:
0501 ##            record = [c.replace(textQualifier * 2, textQualifier) for c in record]
0502 ##        newdata.append(record)
0503 
0504     # This code was submitted by Nigel to replace the code commented out above.
0505     # It addresses several issues with embedded quotes and delimiters.  It seems that
0506     # while off to a good start, regular expressions won't be able to handle certain
0507     # situations. i.e. '''"Say ""hello"", World", ""''' would seem to be a problem as
0508     # an embedded delimiter follows an embedded quote which throws off the re search.
0509 
0510     for line in input:
0511         if textQualifier:
0512             record = []
0513             inquotes = 0
0514             for s in line.split(delimiter):
0515                 odd = s.count(textQualifier) % 2
0516                 if inquotes:
0517                     accu += delimiter + s.replace(textQualifier * 2, delimiter).\
0518                             replace(textQualifier, '').replace(delimiter, textQualifier)
0519                     if odd:
0520                         record.append(accu)
0521                         inquotes = 0
0522                 else:
0523                     # 1.3.6 bugfix: deal with case where s = "" to denote an empty string
0524                     if s.count(textQualifier): # discard whitespace outside of textQualifiers when they are used
0525                         s = s.strip()
0526                     # fix new problem with ""
0527                     if s == textQualifier * 2: 
0528                         s = ""
0529 
0530                     accu = s.replace(textQualifier * 2, delimiter).\
0531                            replace(textQualifier, '').replace(delimiter, textQualifier)
0532                     if odd:
0533                         inquotes = 1
0534                     else:
0535                         record.append(accu)
0536         else:
0537             #record = map(lambda x: x.strip(), line.split(delimiter))
0538             record = map(string.strip, line.split(delimiter))
0539 
0540         newdata.append(record)
0541         # (end of replacement code)
0542         
0543         if updateFunction is not None:
0544             lineno = lineno + 1.0
0545             newpercent = int((lineno / len(input)) * 100)
0546             if percent != newpercent:
0547                 percent = newpercent
0548                 if not updateFunction(percent):
0549                     return None
0550     
0551     if not columns:
0552         columns = modeOfLengths(newdata)
0553     maxColumns = max([len(line) for line in newdata])
0554 
0555     # consistency check
0556     for record in xrange(len(newdata)):
0557         length = len(newdata[record])
0558         difference = length - columns
0559         if difference:
0560             if errorHandler is None:
0561                 raise InvalidNumberOfColumns, "Expected %d, got %d" % (columns, length)
0562             else:
0563                 newdata[record] = errorHandler(input[record], newdata[record], columns, maxColumns)
0564     
0565     # remove null values from data
0566     # newdata = filter(lambda i: i, newdata)
0567     newdata = filter(None, newdata)
0568     
0569     return newdata
0570 
0571 
0572 # ------------------------------------------------------------------------------
0573 def exportDSV(input, delimiter = ',', textQualifier = '"', quoteall = 0):
0574     """
0575     PROTOTYPE:
0576       exportDSV(input, delimiter = ',', textQualifier = '\"', quoteall = 0)
0577     DESCRIPTION:
0578       Exports to DSV (delimiter-separated values) format.
0579     ARGUMENTS:
0580       - input is list of lists of data (as returned by importDSV)
0581       - delimiter is character used to delimit columns
0582       - textQualifier is character used to delimit ambiguous data
0583       - quoteall is boolean specifying whether to quote all data or only data
0584         that requires it
0585     RETURNS:
0586       data as string
0587     """
0588     if not delimiter or type(delimiter) != type(''): raise InvalidDelimiter
0589     if not textQualifier or type(delimiter) != type(''): raise InvalidTextQualifier
0590 
0591     # double-up all text qualifiers in data (i.e. can't becomes can''t)
0592     data = map(lambda i, q = textQualifier:
0593                map(lambda j, q = q: str(j).replace(q, q * 2), i),
0594                input)
0595 
0596     if quoteall: # quote every data value
0597         data = map(lambda i, q = textQualifier:
0598                    map(lambda j, q = q: q + j + q, i),
0599                    data)
0600     else: # quote only the values that contain qualifiers, delimiters or newlines
0601         data = map(lambda i, q = textQualifier, d = delimiter:
0602                    map(lambda j, q = q, d = d: ((j.find(q) != -1 or j.find(d) != -1
0603                                           or j.find('\n') != -1)
0604                                          and (q + j + q)) or j, i), data)
0605     # assemble each line with delimiters
0606     data = [delimiter.join(line) for line in data]
0607     
0608     # assemble all lines together, separated by newlines
0609     data = "\n".join(data)
0610     return data
0611 
0612 if wx is not None:
0613     # ------------------------------------------------------------------------------
0614     class ImportWizardPanel_Delimiters(wx.Panel):
0615         """
0616         CLASS(SUPERCLASS):
0617           ImportWizardPanel_Delimiters(wx.Panel)
0618         DESCRIPTION:
0619           A wx.Panel that provides a basic interface for validating and changing the
0620           parameters for importing a delimited text file. Similar to MS Excel's
0621           CSV import wizard. Can be used in a series of wizards or embedded in an
0622           application.
0623         PROTOTYPE:
0624           ImportWizardPanel_Delimiters(parent, id, file, data, isValidCallback = None,
0625                                        pos = wx.DefaultPosition, size = wx.DefaultSize,
0626                                        style = wx.TAB_TRAVERSAL, name = 'ImportWizardPanel')
0627         ARGUMENTS:
0628           - parent is the parent window
0629           - id is the id of this wizard panel
0630           - file is the name of the file being imported
0631           - data is the raw data to be parsed
0632           - isValidCallback is a callback function that accepts a single boolean argument
0633             If the argument is true, the wizard is in a valid state (all the settings are
0634             acceptable), if the argument is false, trying to import will likely cause an
0635             exception.
0636         METHODS:
0637           - GetDelimiters()
0638             returns list of characters used as delimiters
0639           - GetTextQualifiers()
0640             returns character used as text qualifier or None
0641           - GetHasHeaders()
0642             returns true if first row is header
0643         """
0644 
0645         def __init__(self, parent, id, file, data, isValidCallback = None,
0646                      pos = wx.DefaultPosition, size = wx.DefaultSize,
0647                      style = wx.TAB_TRAVERSAL, name = "ImportWizardPanel"):
0648             wx.Panel.__init__(self, parent, id, pos, size, style, name)
0649             self.SetAutoLayout(True)
0650             mainSizer = wx.FlexGridSizer(3, 1)
0651             self.SetSizer(mainSizer)
0652             mainSizer.AddGrowableCol(0)
0653 
0654             self.initialized = False
0655             self.data = data
0656             self.isValidCallback = isValidCallback
0657             self.Validate = (isValidCallback and self.Validate) or self.BuildPreview
0658 
0659             dlg = wx.ProgressDialog("Import Wizard",
0660                                    "Analyzing %s... Please wait." % file,
0661                                    3,
0662                                    parent,
0663                                    wx.PD_APP_MODAL | wx.PD_AUTO_HIDE)
0664             textQualifier = guessTextQualifier(data)
0665             dlg.Update(1)
0666             newdata = organizeIntoLines(data, textQualifier = textQualifier, limit = 100)
0667             dlg.Update(2)
0668             delimiter = guessDelimiter(newdata, textQualifier = textQualifier)
0669             dlg.Update(3)
0670             dlg.Destroy()
0671 
0672             # -------------
0673             msg = ("This screen lets you set the delimiters your data contains.\n"
0674                    "You can see how your data is affected in the preview below.")
0675             message1 = wx.StaticText(self, -1, msg)
0676 
0677             # -------------
0678             delimiterBox = wx.BoxSizer(wx.HORIZONTAL)
0679             delimStaticBox = wx.StaticBox(self, -1, "Delimiters")
0680             delimStaticSizer = wx.StaticBoxSizer(delimStaticBox, wx.VERTICAL)
0681             delimGridSizer = wx.FlexGridSizer(2, 3)
0682 
0683             delims = {
0684                 'Tab':       '\t',
0685                 'Semicolon': ';',
0686                 'Comma':     ',',
0687                 'Space':     ' ',
0688                 }
0689 
0690             self.delimChecks = {}
0691 
0692             for label, value in delims.items():
0693                 self.delimChecks[value] = wx.CheckBox(self, -1, label)
0694                 delimGridSizer.Add(self.delimChecks[value], 0, wx.ALL, 3)
0695                 wx.EVT_CHECKBOX(self, self.delimChecks[value].GetId(), self.Validate)
0696 
0697             otherSizer = wx.BoxSizer(wx.HORIZONTAL)
0698 
0699             self.delimChecks['Other'] = wx.CheckBox(self, -1, 'Other:')
0700             wx.EVT_CHECKBOX(self, self.delimChecks['Other'].GetId(), self.Validate)
0701 
0702             self.otherDelim = wx.TextCtrl(self, -1, size = (20, -1))
0703             wx.EVT_TEXT(self, self.otherDelim.GetId(), self.OnCustomDelim)
0704 
0705             if self.delimChecks.has_key(delimiter):
0706                 self.delimChecks[delimiter].SetValue(True)
0707             elif delimiter is not None:
0708                 self.delimChecks['Other'].SetValue(True)
0709                 self.otherDelim.SetValue(delimiter)
0710 
0711             otherSizer.AddMany([
0712                 (self.delimChecks['Other'], 0, wx.ALL, 3),
0713                 (self.otherDelim, 0, wx.ALIGN_CENTER),
0714                 ])
0715             
0716             delimGridSizer.Add(otherSizer)
0717             delimStaticSizer.Add(delimGridSizer, 1, wx.EXPAND)
0718             delimOtherSizer = wx.BoxSizer(wx.VERTICAL)
0719             self.consecutiveDelimsAs1 = wx.CheckBox(self, -1, "Treat consecutive delimiters as one")
0720             self.consecutiveDelimsAs1.Enable(False)
0721             tqSizer = wx.BoxSizer(wx.HORIZONTAL)
0722             self.textQualifierChoice = wx.Choice(self, -1, choices = ['"', "'", "{None}"])
0723             wx.EVT_CHOICE(self, self.textQualifierChoice.GetId(), self.BuildPreview)
0724             if textQualifier is not None:
0725                 self.textQualifierChoice.SetStringSelection(textQualifier)
0726             else:
0727                 self.textQualifierChoice.SetStringSelection('{None}')
0728                 
0729             tqSizer.AddMany([
0730                 (wx.StaticText(self, -1, "Text qualifier:"), 0, wx.ALIGN_RIGHT | wx.ALIGN_CENTER_VERTICAL),
0731                 (self.textQualifierChoice, 0, wx.ALL | wx.ALIGN_LEFT | wx.ALIGN_CENTER_VERTICAL, 5),
0732                 ])
0733             
0734             delimOtherSizer.AddMany([
0735                 (self.consecutiveDelimsAs1, 1, wx.EXPAND | wx.ALL, 5),
0736                 (tqSizer, 1, wx.ALL | wx.ALIGN_CENTER, 5),
0737                 ])
0738 
0739             delimiterBox.AddMany([
0740                 (delimStaticSizer, 0, wx.ALIGN_CENTER),
0741                 (delimOtherSizer,  0, wx.ALIGN_CENTER),
0742                 ])
0743             
0744             delimStaticBox.Fit()
0745 
0746             # -------------
0747             self.displayRows = 6
0748             previewSettingsBox = wx.BoxSizer(wx.HORIZONTAL)
0749             self.hasHeaderRow = wx.CheckBox(self, -1, "First row is header")
0750             wx.EVT_CHECKBOX(self, self.hasHeaderRow.GetId(), self.BuildPreview)
0751 
0752             if wx.Platform in ('__WX.WXGTK__', '__WX.WXMSW__'):
0753                 # wx.SpinCtrl causes seg fault under GTK when <enter> is hit in text - use wx.SpinButton instead
0754                 self.previewRowsText = wx.TextCtrl(self, -1, str(self.displayRows),
0755                                                   size = (30, -1), style = wx.TE_PROCESS_ENTER)
0756                 h = self.previewRowsText.GetSize().height
0757                 self.previewRows = wx.SpinButton(self, -1, size = (-1, h), style = wx.SP_VERTICAL)
0758                 self.previewRows.SetRange(self.displayRows, 100)
0759                 self.previewRows.SetValue(self.displayRows)
0760                 wx.EVT_SPIN(self, self.previewRows.GetId(), self.OnSpinPreviewRows)
0761                 wx.EVT_TEXT_ENTER(self, self.previewRowsText.GetId(), self.OnTextPreviewRows)
0762             else:
0763                 self.previewRows = wx.SpinCtrl(self, -1, str(self.displayRows),
0764                                               min = self.displayRows, max = 100, size = (50, -1))
0765                 wx.EVT_SPINCTRL(self, self.previewRows.GetId(), self.BuildPreview)
0766 
0767             previewSettingsBox.AddMany([
0768                 (self.hasHeaderRow, 1, wx.ALL | wx.EXPAND, 5),
0769                 (wx.StaticText(self, -1, "Preview"), 0, wx.WEST | wx.ALIGN_RIGHT | wx.ALIGN_CENTER_VERTICAL, 10),
0770                 ])
0771             if wx.Platform in ('__WX.WXGTK__', '__WX.WXMSW__'):
0772                 previewSettingsBox.Add(self.previewRowsText, 0, wx.ALIGN_CENTER | wx.ALL, 3)
0773             previewSettingsBox.AddMany([
0774                 (self.previewRows, 0, wx.ALIGN_CENTER | wx.ALL, 3),
0775                 (wx.StaticText(self, -1, "rows"), 0, wx.ALIGN_RIGHT | wx.ALIGN_CENTER_VERTICAL),
0776                 ])
0777 
0778             # -------------
0779             if delimiter is not None:
0780                 previewData = importDSV(newdata[:self.displayRows],
0781                                         textQualifier = textQualifier,
0782                                         delimiter = delimiter,
0783                                         errorHandler = padRow)
0784                 hasHeaders = guessHeaders(previewData)
0785                 self.hasHeaderRow.SetValue(hasHeaders)
0786 
0787                 cols = len(previewData[0])
0788             else:
0789                 previewData = []
0790                 hasHeaders = 0
0791                 cols = 1
0792 
0793             previewStaticBox = wx.StaticBox(self, -1, "Data Preview")
0794             previewStaticSizer = wx.StaticBoxSizer(previewStaticBox, wx.VERTICAL)
0795             self.preview = gridlib.Grid(self, -1)
0796             self.preview.CreateGrid(self.displayRows, cols)
0797             self.preview.SetDefaultRowSize(self.preview.GetCharHeight() + 4, True)
0798             self.preview.EnableEditing(False)
0799             self.preview.SetColLabelSize(0)
0800             self.preview.SetRowLabelSize(0)
0801             self.preview.SetMargins(1, 0)
0802             self.initialized = True
0803             self.BuildPreview()
0804 
0805             rowheight = self.preview.GetRowSize(0) + 2
0806             self.preview.SetSize((-1, rowheight * self.displayRows))
0807             previewStaticSizer.Add(self.preview, 0, wx.ALL | wx.EXPAND, 5)
0808 
0809             # -------------
0810             mainSizer.AddMany([
0811                 (message1,     0, wx.ALL, 5),
0812                 (delimiterBox, 0, wx.ALL, 5),
0813                 (previewSettingsBox, 0, wx.ALL, 5),
0814                 (previewStaticSizer, 0, wx.ALL | wx.EXPAND, 5),
0815                 ])
0816 
0817             self.Layout()
0818             self.Fit()
0819 
0820         def OnSpinPreviewRows(self, event):
0821             self.previewRowsText.SetValue(str(event.GetPosition()))
0822             self.BuildPreview()
0823 
0824         def OnTextPreviewRows(self, event):
0825             try:    v = int(self.previewRowsText.GetValue())
0826             except: v = self.displayRows
0827             v = max(self.displayRows, v)
0828             v = min(v, 100)
0829             self.previewRowsText.SetValue(str(v))
0830             self.previewRows.SetValue(v)
0831             self.BuildPreview()
0832 
0833         def Validate(self, event = None):
0834             hasDelimiter = reduce(lambda a, b: a + b, [cb.GetValue() for cb in self.delimChecks.values()])
0835             if hasDelimiter == 1 and self.delimChecks['Other'].GetValue():
0836                 hasDelimiter = self.otherDelim.GetValue() != ""
0837             self.BuildPreview()
0838             self.isValidCallback(hasDelimiter)
0839 
0840         def BuildPreview(self, event = None):
0841             if not self.initialized:
0842                 return # got triggered before initialization was completed
0843 
0844             if wx.Platform != '__WX.WXGTK__':
0845                 wx.BeginBusyCursor() # causes a spurious spin event under GTK
0846             wx.Yield() # allow controls to update first, in case of slow preview
0847             self.preview.BeginBatch()
0848             self.preview.DeleteCols(0, self.preview.GetNumberCols())
0849             self.preview.DeleteRows(0, self.preview.GetNumberRows())
0850             self.preview.ClearGrid()
0851 
0852             textQualifier = self.textQualifierChoice.GetStringSelection()
0853             if textQualifier == '{None}': textQualifier = None
0854             other = self.otherDelim.GetValue()
0855             delimiter = map(lambda i, other = other: i[0] != 'Other' and i[0] or other,
0856                             filter(lambda i: i[1].GetValue(), self.delimChecks.items()))
0857             delimiter = "".join(delimiter)
0858 
0859             rows = self.previewRows.GetValue()
0860 
0861             newdata = organizeIntoLines(self.data, textQualifier, limit = rows)
0862             try:
0863                 previewData = importDSV(newdata[:rows],
0864                                         textQualifier = textQualifier,
0865                                         delimiter = delimiter,
0866                                         errorHandler = padRow)
0867             except InvalidDelimiter, e:
0868                 previewData = map(lambda i: [i], newdata[:rows])
0869 
0870             rows = min(rows, len(previewData))
0871             hasHeaders = self.hasHeaderRow.GetValue()
0872             self.preview.AppendRows(rows - hasHeaders)
0873             cols = max([len(row) for row in previewData])
0874             self.preview.AppendCols(cols)
0875 
0876             if hasHeaders:
0877                 self.preview.SetColLabelSize(self.preview.GetRowSize(0))
0878                 for col in range(cols):
0879                     try:    self.preview.SetColLabelValue(col, str(previewData[0][col]))
0880                     except: self.preview.SetColLabelValue(col, "")
0881                 # self.preview.AutoSizeColumns(True) # size columns to headers
0882             else:
0883                 self.preview.SetColLabelSize(0)
0884 
0885             for row in range(hasHeaders, rows):
0886                 for col in range(cols):
0887                     try:    self.preview.SetCellValue(row - hasHeaders, col, str(previewData[row][col]))
0888                     except: pass
0889 
0890             # if not hasHeaders:
0891             self.preview.AutoSizeColumns(True) # size columns to data
0892 
0893             rowheight = self.preview.GetRowSize(0)
0894             self.preview.SetRowSize(0, rowheight)
0895             self.preview.EndBatch()
0896             if wx.Platform != '__WX.WXGTK__':
0897                 wx.EndBusyCursor()
0898 
0899             self.delimiters = delimiter
0900             self.textQualifier = textQualifier
0901             self.hasHeaders = hasHeaders
0902 
0903         def OnCustomDelim(self, event = None):
0904             self.delimChecks['Other'].SetValue(len(self.otherDelim.GetValue()))
0905             self.Validate()
0906 
0907         def GetDelimiters(self):
0908             return self.delimiters
0909 
0910         def GetTextQualifier(self):
0911             return self.textQualifier
0912 
0913         def GetHasHeaders(self):
0914             return self.hasHeaders
0915 
0916     # ------------------------------------------------------------------------------        
0917     class ImportWizardDialog(wx.Dialog):
0918         """
0919         CLASS(SUPERCLASS):
0920           ImportWizardDialog(wx.Dialog)
0921         DESCRIPTION:
0922           A dialog allowing the user to preview and change the options for importing
0923           a file.
0924         PROTOTYPE:
0925           ImportWizardDialog(parent, id, title, file,
0926                              pos = wx.DefaultPosition, size = wx.DefaultSize,
0927                              style = wx.DEFAULT_DIALOG_STYLE, name = 'ImportWizardDialog')
0928         ARGUMENTS:
0929           - parent: the parent window
0930           - id: the id of this window
0931           - title: the title of this dialog
0932           - file: the file to import
0933         METHODS:
0934           - GetImportInfo()
0935             returns a tuple (delimiters, text qualifiers, has headers)
0936           - ImportData(errorHandler = skipRow)
0937             returns (headers, data), headers may be None
0938             errorHandler is a callback function that instructs the method on what
0939             to do with irregular rows. The default skipRow function simply discards
0940             the bad row (see importDSV() above).
0941         """
0942 
0943         def __init__(self, parent, id, title, file,
0944                      pos = wx.DefaultPosition, size = wx.DefaultSize,
0945                      style = wx.DEFAULT_DIALOG_STYLE, name = "ImportWizardDialog"):
0946             wx.Dialog.__init__(self, parent, id, title, pos, size, style, name)
0947             self.SetAutoLayout(True)
0948 
0949             self.file = file
0950             f = open(file, 'r')
0951             self.data = f.read()
0952             f.close()
0953 
0954             sizer = wx.BoxSizer(wx.VERTICAL)
0955             self.delimPanel = ImportWizardPanel_Delimiters(self, -1, file, self.data, self.ValidState)
0956             buttonBox = self.ButtonBox()
0957             sizer.AddMany([
0958                 (self.delimPanel, 0, wx.ALL, 5),
0959                 (buttonBox, 0, wx.SOUTH | wx.ALIGN_CENTER_HORIZONTAL | wx.ALIGN_TOP, 0),
0960                 ])
0961 
0962             self.SetSizer(sizer)
0963             self.Layout()
0964             sizer.Fit(self.delimPanel)
0965             self.Fit()
0966             self.Centre()
0967 
0968         def ButtonBox(self):
0969             panel = wx.Panel(self, -1)
0970             panel.SetAutoLayout(True)
0971             sizer = wx.BoxSizer(wx.HORIZONTAL)
0972             panel.SetSizer(sizer)
0973             self.ok = wx.Button(panel, wx.ID_OK, "Ok")
0974             cancel = wx.Button(panel, wx.ID_CANCEL, "Cancel")
0975             sizer.AddMany([
0976                 (self.ok, 0, wx.ALIGN_TOP | wx.EAST | wx.SOUTH, 10),
0977                 (cancel, 0, wx.ALIGN_TOP | wx.WEST | wx.SOUTH, 10),
0978                 ])
0979             panel.Layout()
0980             panel.Fit()
0981             return panel
0982 
0983         def GetImportInfo(self):
0984             return (self.delimPanel.GetDelimiters(),
0985                     self.delimPanel.GetTextQualifier(),
0986                     self.delimPanel.GetHasHeaders())
0987 
0988         def ImportData(self, errorHandler = skipRow):
0989             delimiters, qualifier, hasHeaders = self.GetImportInfo()
0990             self.data = organizeIntoLines(self.data, textQualifier = qualifier)
0991             dlg = wx.ProgressDialog("Import DSV File",
0992                                    self.file,
0993                                    100,
0994                                    self,
0995                                    wx.PD_CAN_ABORT | wx.PD_APP_MODAL | wx.PD_AUTO_HIDE)
0996             self.data = importDSV(self.data,
0997                                   delimiter = delimiters,
0998                                   textQualifier = qualifier,
0999                                   updateFunction = dlg.Update,
1000                                   errorHandler = errorHandler)
1001             if self.data is None: return None
1002             if hasHeaders:
1003                 headers = copy.copy(self.data[0])
1004                 del self.data[0]
1005             else:
1006                 headers = None
1007             return (headers, self.data)
1008 
1009         def ValidState(self, isValid):
1010             self.ok.Enable(isValid)
1011 
1012 
1013 # ------------------------------------------------------------------------------
1014 if __name__ == '__main__':
1015     if wx is None:
1016         print "\nYou need wxPython to run this sample*."
1017         print "\n*Note that wxPython is _not_ necessary to use this module, but it is required"
1018         print "to use the wizard dialog (which the sample requires)."
1019         raise SystemExit
1020 
1021 
1022     def demo():
1023         class SampleApp(wx.App):
1024             def OnInit(self):
1025                 dlg = wx.FileDialog(None, "Choose a file", ".", "",
1026                                    "CSV files (*.csv)|*.csv|Text files (*.txt)|*.txt|All files (*.*)|*.*",
1027                                    wx.OPEN)
1028                 if dlg.ShowModal() == wx.ID_OK:
1029                     path = dlg.GetPath()
1030                     dlg.Destroy()
1031 
1032                     errorLog = open('import_error.log', 'a+') 
1033                     def logErrors(oldrow, newrow, expectedColumns, maxColumns, file = errorLog):
1034                         # log the bad row to a file
1035                         file.write(oldrow + '\n')
1036 
1037                     dlg = ImportWizardDialog(None, -1, 'CSV Import Wizard (v.%s)' % __version__, path)
1038                     if dlg.ShowModal() == wx.ID_OK:
1039                         results = dlg.ImportData(errorHandler = logErrors)
1040                         dlg.Destroy()
1041                         errorLog.close()
1042 
1043                         if results is not None:
1044                             headers, data = results
1045                             if True: # print the output to stdout
1046                                 if headers:
1047                                     print headers
1048                                     print 80*'='
1049                                 for row in data:
1050                                     print row
1051 
1052                             if True: # for testing export functionality
1053                                 if headers:
1054                                     print exportDSV([headers] + data)
1055                                 else:
1056                                     print exportDSV(data)
1057                     else:
1058                         dlg.Destroy()
1059 
1060                 else:
1061                     dlg.Destroy()
1062 
1063                 return True
1064 
1065         app = SampleApp()
1066         app.MainLoop()
1067 
1068     # import profile
1069     # profile.run('demo()')
1070     demo()
1071
Generated by PyXR 0.9.4