git.sur5r.net Git - glabels/commitdiff
2007-05-13 Jim Evins <evins@snaught.com>
author Jim Evins <evins@snaught.com>
Sun, 13 May 2007 23:38:05 +0000 (23:38 +0000)
committer Jim Evins <evins@snaught.com>
Sun, 13 May 2007 23:38:05 +0000 (23:38 +0000)
* src/merge-text.c: (parse_line), (parse_field):
Remove leading and trailing spaces from CSV fields, unless quoted.

git-svn-id: https://glabels.svn.sourceforge.net/svnroot/glabels/trunk@661 f5e0f49d-192f-0410-a22d-a8d8700d0965

glabels2/ChangeLog
glabels2/src/merge-text.c

index 7c009bf8bf3515379ce931a9a60d55f9f00ba3a3..4c964c5654a93d7511fcffe02dbbf78a65c6c30d 100644 (file)
@@ -1,3 +1,8 @@
+2007-05-13  Jim Evins  <evins@snaught.com>
+
+       * src/merge-text.c: (parse_line), (parse_field):
+               Remove leading and trailing spaces from CSV fields, unless quoted.
+
 2007-05-09  Jim Evins  <evins@snaught.com>
 
        * src/object-editor-bc-page.c:
index 77afcfedc0b93b6006561596d3956dcecb7c6d83..674e81d13a37af47928380d8dc9d68598b0fee76 100644 (file)
@@ -81,6 +81,7 @@ static void           gl_merge_text_copy            (glMerge          *dst_merge
 
 static GList         *parse_line                    (FILE             *fp,
                                                     gchar             delim);
+static gchar         *parse_field                   (gchar            *raw_field);
 static void           free_fields                   (GList           **fields);
 
 \f
@@ -358,7 +359,24 @@ gl_merge_text_copy (glMerge *dst_merge,
 }
 
 /*---------------------------------------------------------------------------*/
-/* PRIVATE.  Parse line (quoted values may span multiple lines).             */
+/* PRIVATE.  Parse line.                                                     */
+/*                                                                           */
+/* Attempt to be a robust parser of various CSV (and similar) formats.       */
+/*                                                                           */
+/* Split into fields, accounting for:                                        */
+/*   - delimiters may be embedded in quoted text (")                         */
+/*   - delimiters may be "escaped" by a leading backslash (\)                */
+/*   - quotes may be embedded in quoted text as two adjacent quotes ("")     */
+/*   - quotes may be "escaped" either within or outside of quoted text.      */
+/*   - newlines may be embedded in quoted text, allowing a field to span     */
+/*     more than one line.                                                   */
+/*                                                                           */
+/* This function does not do any parsing of the individual fields, other     */
+/* than to correctly interpret delimiters.  Actual parsing of the individual */
+/* fields is done in parse_field().                                          */
+/*                                                                           */
+/* Returns a list of fields.  A blank line is considered a line with one     */
+/* empty field.  Returns empty (NULL) when done.                             */
 /*---------------------------------------------------------------------------*/
 static GList *
 parse_line (FILE  *fp,
@@ -367,9 +385,8 @@ parse_line (FILE  *fp,
        GList *list = NULL;
        GString *string;
        gint c;
-       enum { BEGIN, NORMAL, NORMAL_ESCAPED,
-              QUOTED, QUOTED_ESCAPED, QUOTED_QUOTE1,
-              DONE } state;
+       enum { BEGIN, NORMAL, QUOTED, QUOTED_QUOTE1,
+               NORMAL_ESCAPED, QUOTED_ESCAPED, DONE } state;
 
        state = BEGIN;
        string = g_string_new( "" );
@@ -379,70 +396,204 @@ parse_line (FILE  *fp,
                switch (state) {
 
                case BEGIN:
+                        if ( c == delim )
+                        {
+                                /* first field is empty. */
+                                list = g_list_append (list, g_strdup (""));
+                               state = NORMAL;
+                                break;
+                        }
                        switch (c) {
-                       case '\\':
-                               state = NORMAL_ESCAPED;
-                               break;
                        case '"':
+                                string = g_string_append_c (string, c);
                                state = QUOTED;
                                break;
-                       case '\r':
-                               /* Strip CR. */
-                               state = NORMAL;
+                       case '\\':
+                                string = g_string_append_c (string, c);
+                               state = NORMAL_ESCAPED;
                                break;
                        case '\n':
                                /* treat as one empty field. */
-                               list = g_list_append (list,
-                                                     g_strdup (""));
+                               list = g_list_append (list, g_strdup (""));
                                state = DONE;
                                break;
                        case EOF:
+                                /* end of file, no more lines. */
                                state = DONE;
                                break;
                        default:
-                               if ( c != delim ) {
-                                       string = g_string_append_c (string, c);
-                               } else {
-                                       list = g_list_append (list,
-                                                     g_strdup (string->str));
-                                       string = g_string_assign( string, "" );
-                               }
+                                string = g_string_append_c (string, c);
                                state = NORMAL;
                                break;
                        }
                        break;
 
                case NORMAL:
+                        if ( c == delim )
+                        {
+                                list = g_list_append (list, parse_field (string->str));
+                                string = g_string_assign( string, "" );
+                                state = NORMAL;
+                                break;
+                        }
                        switch (c) {
+                       case '"':
+                                string = g_string_append_c (string, c);
+                               state = QUOTED;
+                               break;
                        case '\\':
+                                string = g_string_append_c (string, c);
                                state = NORMAL_ESCAPED;
                                break;
+                       case '\n':
+                       case EOF:
+                               list = g_list_append (list, parse_field (string->str));
+                               state = DONE;
+                               break;
+                       default:
+                                string = g_string_append_c (string, c);
+                                state = NORMAL;
+                               break;
+                       }
+                       break;
+
+               case QUOTED:
+                       switch (c) {
                        case '"':
-                               state = QUOTED;
+                                string = g_string_append_c (string, c);
+                               state = QUOTED_QUOTE1;
                                break;
-                       case '\r':
-                               /* Strip CR. */
+                       case '\\':
+                                string = g_string_append_c (string, c);
+                               state = QUOTED_ESCAPED;
+                               break;
+                       case EOF:
+                               /* File ended mid way through quoted item */
+                               list = g_list_append (list, parse_field (string->str));
+                               state = DONE;
+                               break;
+                       default:
+                               string = g_string_append_c (string, c);
+                               break;
+                       }
+                       break;
+
+               case QUOTED_QUOTE1:
+                        if ( c == delim )
+                        {
+                                list = g_list_append (list, parse_field (string->str));
+                                string = g_string_assign( string, "" );
+                                state = NORMAL;
+                                break;
+                        }
+                       switch (c) {
+                       case '"':
+                               /* insert quotes in string, stay quoted. */
+                               string = g_string_append_c (string, c);
+                               state = QUOTED;
                                break;
                        case '\n':
                        case EOF:
-                               list = g_list_append (list,
-                                                     g_strdup (string->str));
+                               /* line or file ended after quoted item */
+                               list = g_list_append (list, parse_field (string->str));
                                state = DONE;
                                break;
                        default:
-                               if ( c != delim ) {
-                                       string = g_string_append_c (string, c);
-                               } else {
-                                       list = g_list_append (list,
-                                                     g_strdup (string->str));
-                                       string = g_string_assign( string, "" );
-                               }
+                                string = g_string_append_c (string, c);
+                               state = NORMAL;
                                break;
                        }
                        break;
 
                case NORMAL_ESCAPED:
                        switch (c) {
+                       case EOF:
+                               /* File ended mid way through an escape sequence */
+                               list = g_list_append (list, parse_field (string->str));
+                               state = DONE;
+                               break;
+                       default:
+                               string = g_string_append_c (string, c);
+                               state = NORMAL;
+                               break;
+                       }
+                       break;
+
+               case QUOTED_ESCAPED:
+                       switch (c) {
+                       case EOF:
+                               /* File ended mid way through quoted item */
+                               list = g_list_append (list, parse_field (string->str));
+                               state = DONE;
+                               break;
+                       default:
+                               string = g_string_append_c (string, c);
+                               state = QUOTED;
+                               break;
+                       }
+                       break;
+
+               default:
+                       g_assert_not_reached();
+                       break;
+               }
+
+       }
+       g_string_free( string, TRUE );
+
+       return list;
+}
+
+/*---------------------------------------------------------------------------*/
+/* PRIVATE.  Parse field.                                                    */
+/*                                                                           */
+/*  - Strip leading and trailing white space, unless quoted.                 */
+/*  - Strip CR, unless escaped.                                              */
+/*  - Expand '\n' and '\t' into newline and tab characters.                  */
+/*  - Remove quotes, unless escaped (\" anywhere or "" within quotes)        */
+/*---------------------------------------------------------------------------*/
+static gchar *
+parse_field (gchar  *raw_field)
+{
+       GString *string;
+        gchar   *pass1_field, *c, *field;
+       enum { NORMAL, NORMAL_ESCAPED, QUOTED, QUOTED_ESCAPED, QUOTED_QUOTE1} state;
+
+
+        /*
+         * Pass 1: remove leading and trailing spaces.
+         */
+        pass1_field = g_strdup (raw_field);
+        g_strstrip (pass1_field);
+
+        /*
+         * Pass 2: resolve quoting and escaping.
+         */
+       state = NORMAL;
+       string = g_string_new( "" );
+        for ( c=pass1_field; *c != 0; c++ )
+        {
+               switch (state) {
+
+               case NORMAL:
+                       switch (*c) {
+                       case '\\':
+                               state = NORMAL_ESCAPED;
+                               break;
+                       case '"':
+                               state = QUOTED;
+                               break;
+                       case '\r':
+                               /* Strip CR. */
+                               break;
+                       default:
+                                string = g_string_append_c (string, *c);
+                               break;
+                       }
+                       break;
+
+               case NORMAL_ESCAPED:
+                       switch (*c) {
                        case 'n':
                                string = g_string_append_c (string, '\n');
                                state = NORMAL;
@@ -451,21 +602,15 @@ parse_line (FILE  *fp,
                                string = g_string_append_c (string, '\t');
                                state = NORMAL;
                                break;
-                       case '\r':
-                               /* Strip CR, stay ESCAPED. */
-                               break;
-                       case EOF:
-                               state = DONE;
-                               break;
                        default:
-                               string = g_string_append_c (string, c);
+                               string = g_string_append_c (string, *c);
                                state = NORMAL;
                                break;
                        }
                        break;
 
                case QUOTED:
-                       switch (c) {
+                       switch (*c) {
                        case '\\':
                                state = QUOTED_ESCAPED;
                                break;
@@ -475,20 +620,14 @@ parse_line (FILE  *fp,
                        case '\r':
                                /* Strip CR. */
                                break;
-                       case EOF:
-                               /* File ended mid way through quoted item */
-                               list = g_list_append (list,
-                                                     g_strdup (string->str));
-                               state = DONE;
-                               break;
                        default:
-                               string = g_string_append_c (string, c);
+                               string = g_string_append_c (string, *c);
                                break;
                        }
                        break;
 
                case QUOTED_ESCAPED:
-                       switch (c) {
+                       switch (*c) {
                        case 'n':
                                string = g_string_append_c (string, '\n');
                                state = QUOTED;
@@ -497,48 +636,26 @@ parse_line (FILE  *fp,
                                string = g_string_append_c (string, '\t');
                                state = QUOTED;
                                break;
-                       case '\r':
-                               /* Strip CR, stay ESCAPED. */
-                               break;
-                       case EOF:
-                               /* File ended mid way through quoted item */
-                               list = g_list_append (list,
-                                                     g_strdup (string->str));
-                               state = DONE;
-                               break;
                        default:
-                               string = g_string_append_c (string, c);
+                               string = g_string_append_c (string, *c);
                                state = QUOTED;
                                break;
                        }
                        break;
 
                case QUOTED_QUOTE1:
-                       switch (c) {
+                       switch (*c) {
                        case '"':
                                /* insert quotes in string, stay quoted. */
-                               string = g_string_append_c (string, c);
+                               string = g_string_append_c (string, *c);
                                state = QUOTED;
                                break;
                        case '\r':
-                               /* Strip CR, return to NORMAL. */
-                               state = NORMAL;
-                               break;
-                       case '\n':
-                       case EOF:
-                               /* line or file ended after quoted item */
-                               list = g_list_append (list,
-                                                     g_strdup (string->str));
-                               state = DONE;
+                               /* Strip CR, return to QUOTED. */
+                               state = QUOTED;
                                break;
                        default:
-                               if ( c != delim ) {
-                                       string = g_string_append_c (string, c);
-                               } else {
-                                       list = g_list_append (list,
-                                                     g_strdup (string->str));
-                                       string = g_string_assign( string, "" );
-                               }
+                                string = g_string_append_c (string, *c);
                                state = NORMAL;
                                break;
                        }
@@ -550,9 +667,12 @@ parse_line (FILE  *fp,
                }
 
        }
+
+        field = g_strdup (string->str);
        g_string_free( string, TRUE );
+        g_free (pass1_field);
 
-       return list;
+       return field;
 }
 
 /*---------------------------------------------------------------------------*/