Source File csv/csv.ils

    1 ;;; csv.ils --- Simple CSV parser (Excel-style, text/csv)
    2 
    3 ;; Copyright (C) 2012  Damien Diederen
    4 
    5 ;; @author   Damien Diederen <dd@crosstwine.com>
    6 ;; @keywords csv
    7 
    8 ;; All Rights Reserved.
    9 ;;
   10 ;; NOTICE: All information, intellectual and technical concepts
   11 ;; contained herein are, and remain the property of Damien Diederen
   12 ;; and his suppliers, if any.  Dissemination of this information or
   13 ;; reproduction of this material is strictly forbidden unless prior
   14 ;; written permission is obtained from Damien Diederen.
   15 
   16 ;;; Commentary:
   17 
   18 ;; Package csv (global prefix VedaCsv) provides a simple parser
   19 ;; matching the syntax produced by Microsoft Excel in an U.S. locale.
   20 
   21 ;;; Code:
   22 
   23 ;; Parser holds the CSV parser state.
   24 ;;
   25 ;; @ignore Internal
   26 (defclass VedaCsvParser ()
   27   ((filename @initarg filename)
   28    (line     @initform 1)
   29    (cell     @initform nil)
   30    (row      @initform nil)
   31    (rows     @initform nil)))
   32 
   33 ;; Scope/hide utility functions and related data.
   34 (let ()
   35   ;; Export exports function fn as a global symbol with the VedaCsv
   36   ;; prefix.
   37   (defun Export (suffix fn)
   38     (putd (concat 'VedaCsv suffix) fn))
   39 
   40   ;; Constants used during parsing.
   41 
   42   (define asciiQuotationMark (intToChar 0x22))
   43 
   44   (define asciiComma (intToChar 0x2c))
   45 
   46   (define asciiLineFeed (intToChar 0xa))
   47 
   48   (define asciiCarriageReturn (intToChar 0x0d))
   49 
   50   (define endOfRow (list nil asciiLineFeed))
   51 
   52   ;; Create a parser instance
   53   (defun MakeParser (filename)
   54     (makeInstance 'VedaCsvParser ?filename filename))
   55 
   56   ;; Parser state manipulation.
   57 
   58   (defun FinishCell (p)
   59     (letseq ((str (buildString (reverse p->cell) ""))
   60              (newrow (cons str p->row)))
   61       (p->row = newrow)
   62       (p->cell = nil)))
   63 
   64   (defun FinishRow (p)
   65     (let ((newrows (cons (reverse p->row) p->rows)))
   66       (p->rows = newrows)
   67       (p->row = nil)
   68       (p->cell = nil)))
   69 
   70   (defun Rows (p)
   71     (reverse p->rows))
   72 
   73   (defun PushToCell (p c)
   74     (let ((newcell (cons c p->cell)))
   75       (p->cell = newcell)))
   76 
   77   (defun Error (p msg)
   78     (let (pieces)
   79       (push (sprintf nil "%L" p->line) pieces)
   80       (push (or p->filename "<input>") pieces)
   81       (let ((prefix (buildString pieces ":")))
   82         (error "%s: %s" prefix msg))))
   83 
   84   ;; State functions.
   85 
   86   ;; Cursor is at the beginning of a row of records, either because we
   87   ;; are starting or because newline has just been seen.
   88   (defun NewRowState (p c)
   89     (unless (null c)
   90       (p->row = nil)
   91       (p->cell = nil)
   92       (CellState p c)))
   93 
   94   ;; Cursor is collecting characters for the current cell.
   95   (defun CellState (p c)
   96     (cond
   97       ((eq c asciiComma)
   98        (FinishCell p)
   99        CellState)
  100       ((eq c asciiQuotationMark)
  101        QuotedCellState)
  102       ((memq c endOfRow)
  103        (FinishCell p)
  104        (FinishRow p)
  105        (when c
  106          NewRowState))
  107       ((eq c asciiCarriageReturn)
  108        ;; Ignored.
  109        CellState)
  110       (t
  111        (PushToCell p c)
  112        CellState)))
  113 
  114   ;; An ASCII quotation mark has been seen; the cursor is collecting
  115   ;; characters within quoted cell contents.
  116   (defun QuotedCellState (p c)
  117     (cond
  118       ((eq c asciiQuotationMark)
  119        QuotedCellStateSeenQuote)
  120       ((null c)
  121        (Error p "Unexpected EOF within quoted CSV cell."))
  122       (t
  123        (PushToCell p c)
  124        QuotedCellState)))
  125 
  126   ;; An ASCII quotation mark has been seen within a quoted section; we
  127   ;; don't know whether it closes the section or is an escape for a
  128   ;; literal quotation mark.
  129   (defun QuotedCellStateSeenQuote (p c)
  130     (cond
  131       ((eq c asciiQuotationMark)
  132        (PushToCell p c)
  133        QuotedCellState)
  134       (t
  135        (CellState p c))))
  136 
  137   ;; Parsing functions
  138 
  139   ;; Parse the characters returned by the source cs, a function
  140   ;; returning either the character symbol or nil for EOF.
  141   ;;
  142   ;; If passed and non-nil, filename may be used for diagnostic
  143   ;; purposes.
  144   (defun ParseCharStream (cs @optional filename)
  145     (let ((p (MakeParser filename))
  146           (state NewRowState))
  147       (while state
  148         (let ((c (funcall cs)))
  149           (when (eq c asciiLineFeed)
  150             (p->line = (p->line + 1)))
  151           (setq state (funcall state p c))))
  152       (Rows p)))
  153 
  154   ;; Makes a character source which consumes lines, as described in
  155   ;; ParseLines.
  156   (defun MakeLinesCharStream (lines)
  157     (cond
  158       ((null lines)
  159        (lambda () nil))
  160       (t
  161        (let ((index 1)
  162              (end (strlen (car lines)))
  163              (line (pop lines)))
  164          (lambda ()
  165            (cond
  166              ((index > end)
  167               (when lines
  168                 (setq line (pop lines))
  169                 (setq index 1)
  170                 (setq end (strlen line))
  171                 asciiLineFeed))
  172              (t
  173               (prog1
  174                   (getchar line index)
  175                 (setq index (index + 1))))))))))
  176 
  177   ;; ParseLines parses a list of string "lines" as a CSV stream.  The
  178   ;; list of strings is interpreted as if joigned by:
  179   ;;
  180   ;;     (buildString lines "\n")
  181   ;;
  182   ;; but the code doesn't do that to avoid triggering string length
  183   ;; limits.  Note that each "line" can also embed one or more \n
  184   ;; characters.
  185   ;;
  186   ;; Cf. ParseFile for more information and return specification.
  187   (defun ParseLines (lines)
  188     (ParseCharStream (MakeLinesCharStream lines)))
  189 
  190   (Export 'ParseLines ParseLines)
  191 
  192   ;; ParsePort parses text read from port as a CSV stream.  Parsing
  193   ;; finishes when EOF is reached, but the port is not closed.
  194   ;;
  195   ;; Cf. ParseFile for more information and return specification.
  196   (defun ParsePort (port)
  197     (ParseCharStream (lambda () (getc port))))
  198 
  199   (Export 'ParsePort ParsePort)
  200 
  201   ;; ParseFile parses filename as a CSV stream, returning a list of
  202   ;; lists of strings; the upper-left corner of a spreadsheet filled
  203   ;; with cell addresses would parse as:
  204   ;;
  205   ;;     (("A1" "B1" "C1")
  206   ;;      ("A2" "B2" "C2")
  207   ;;      ("A3" "B3" "C3"))
  208   ;;
  209   ;; Escape sequences and in-cell carriage returns are supported, but
  210   ;; no data interpretation is done besides parsing to strings.
  211   ;;
  212   ;; Errors are thrown if filename cannot be open for reading, or when
  213   ;; EOF is encountered within a quoted cell.
  214   ;;
  215   ;; "Staggered" spreadsheets, or streams ending without a carriage
  216   ;; return, are not error conditions; they produce the "obvious"
  217   ;; result:
  218   ;;
  219   ;;     a,b,c\n         (("a" "b" "c")
  220   ;;     d,e\n      =>    ("d" "e")
  221   ;;     f,g,h\n          ("f" "g" "h")
  222   ;;     i                ("i"))
  223   (defun ParseFile (filename)
  224     (let ((port (or (infile filename)
  225                     (error "Unable to open %L for reading" filename))))
  226       ;; TODO: unwindProtect, depending on Virtuoso version.
  227       (prog1
  228           (ParseCharStream (lambda () (getc port)) filename)
  229         (close port))))
  230 
  231   (Export 'ParseFile ParseFile))
  232 
  233 ;;; csv.ils ends here