- Correctly handles Unicode characters whose number of ASCII alternates (set by \uc) differs from the default of 1. - Adds a --utf8 command line argument to output raw UTF-8 in text output mode. diff -u unrtf-0.21.0/src/convert.c unrtf-0.21.0-patch/src/convert.c --- unrtf-0.21.0/src/convert.c 2008-12-13 08:04:59.000000000 +1000 +++ unrtf-0.21.0-patch/src/convert.c 2009-09-11 17:27:22.000000000 +1000 @@ -96,6 +96,7 @@ #include "word.h" #include "hash.h" #include "convert.h" +#include "unicode.h" #ifndef HAVE_ATTR_H #include "attr.h" @@ -527,6 +528,7 @@ extern int nopict_mode; +extern int utf8_text_mode; /* #define BINARY_ATTRS @@ -603,6 +605,11 @@ static int banner_printed=FALSE; +/* Number of words to skip after a unicode character, + set by \uc. +*/ +static size_t unicode_word_skip_count = 1; + /*======================================================================== * Name: print_banner @@ -2139,14 +2146,18 @@ printf("%s", alias); done++; } - else - if(!done && op->unisymbol_print) + else if(!done){ + if(op->unisymbol_print) { sprintf(tmp, "%d", param); if (safe_printf(1, op->unisymbol_print, tmp)) fprintf(stderr, TOO_MANY_ARGS, "unisymbol_print"); done++; + } else if(utf8_text_mode){ + printf(unicode_to_static_string(param)); + done++; } + } /* ** if we know how to represent the unicode character in the @@ -2154,11 +2165,25 @@ ** we will output that alternative. */ if (done) - return(SKIP_ONE_WORD); + return(SKIP_UNICODE_ALTERNATE); return(FALSE); } /*======================================================================== + * Name: cmd_uc + * Purpose: Executes the \uc command, setting the unicode skip count. + * Args: Word, paragraph align info, and numeric param if any. + * Returns: Flag, true only if rest of Words on line should be ignored. 
+ *=======================================================================*/ + +static int cmd_uc (Word *w, int align, char has_param, int param) { + if (has_param && param >= 0) { + unicode_word_skip_count = param; + } + return FALSE; +} + +/*======================================================================== * Name: cmd_dn * Purpose: Executes the \dn command. * Args: Word, paragraph align info, and numeric param if any. @@ -2933,6 +2958,7 @@ }; static HashItem hashArray_u [] = { { "u", &cmd_u, NULL }, + { "uc", &cmd_uc, NULL }, { "ul", &cmd_ul, NULL }, { "up", &cmd_up, NULL }, { "uld", &cmd_uld, NULL }, @@ -3214,6 +3240,7 @@ int is_cell_group=FALSE; int paragraph_begined=FALSE; int paragraph_align=ALIGN_LEFT; + int i; CHECK_PARAM_NOT_NULL(w); @@ -3304,7 +3331,6 @@ if (op->word_begin) if (safe_printf(0, op->word_begin)) fprintf(stderr, TOO_MANY_ARGS, "word_begin"); - print_with_special_exprs (s); if (op->word_end) @@ -3465,13 +3491,12 @@ //unicode terminate_group = hip[index].func (w,paragraph_align, have_param, param); /* daved - 0.19.4 - unicode support may need to skip only one word */ - if (terminate_group == SKIP_ONE_WORD) - w=w->next; - else - if (terminate_group) + if (terminate_group == SKIP_UNICODE_ALTERNATE) { + for(i=0;i < unicode_word_skip_count && w;i++,w=w->next); + } else if (terminate_group) { while(w) w=w->next; + } } - debug=hip[index].debug_print; if (debug && debug_mode) { diff -u unrtf-0.21.0/src/defs.h unrtf-0.21.0-patch/src/defs.h --- unrtf-0.21.0/src/defs.h 2008-12-13 10:29:57.000000000 +1000 +++ unrtf-0.21.0-patch/src/defs.h 2009-09-11 17:27:22.000000000 +1000 @@ -61,10 +61,10 @@ #define FALSE (0) #endif #if 1 /* daved - 0.19.4 */ -#define SKIP_ONE_WORD 2 +#define SKIP_UNICODE_ALTERNATE 2 #endif -#define USAGE "unrtf [--version] [--verbose] [--help] [--nopict|-n] [--noremap] [--html] [--text] [--vt] [--latex] [--rtf] [-P config_search_path] [-t )] " +#define USAGE "unrtf [--version] [--verbose] [--help] [--nopict|-n] [--utf8] 
[--noremap] [--html] [--text] [--vt] [--latex] [--rtf] [-P config_search_path] [-t )] " /* Default names for RTF's default fonts */ diff -u unrtf-0.21.0/src/main.c unrtf-0.21.0-patch/src/main.c --- unrtf-0.21.0/src/main.c 2008-12-13 10:21:36.000000000 +1000 +++ unrtf-0.21.0-patch/src/main.c 2009-09-11 17:27:22.000000000 +1000 @@ -94,6 +94,9 @@ int verbose_mode; /* TRUE => Output additional informations about unrtf */ int no_remap_mode; /* don't remap codepoints */ +int utf8_text_mode; /* TRUE => Write unicode characters as UTF-8 when unisymbol_print + isn't defined. */ + OutputPersonality *op = NULL; @@ -189,6 +192,7 @@ else if (!strcmp("--verbose", argv[i])) verbose_mode = TRUE; else if (!strcmp("--simple", argv[i])) simple_mode = TRUE; else if (!strcmp("--noremap", argv[i])) no_remap_mode = TRUE; + else if (!strcmp("--utf8", argv[i])) utf8_text_mode = TRUE; else if (!strcmp("-t", argv[i])) { if ((i + 1) < argc && *argv[i + 1] != '-') diff -u unrtf-0.21.0/src/unicode.c unrtf-0.21.0-patch/src/unicode.c --- unrtf-0.21.0/src/unicode.c 2008-11-09 20:42:33.000000000 +1000 +++ unrtf-0.21.0-patch/src/unicode.c 2009-09-11 18:28:39.000000000 +1000 @@ -6,9 +6,12 @@ *---------------------------------------------------------------------- */ #include +#include #include "malloc.h" +my_iconv_t utf16_desc = MY_ICONV_T_CLEAR; + /*======================================================================== * Name get_unicode * Purpose: Translates unicode character to number. @@ -38,6 +41,37 @@ } /*======================================================================== + * Name unicode_to_static_string + * Purpose: Translates a unicode number to string using iconv. + * Args: Unicode number. + * Returns: Unicode character as string. 
+ *=======================================================================*/ +char * +unicode_to_static_string(int uc) +{ + iconv_t ic; + static char out[7]; + size_t outremaining = 7; + char in[2]; + size_t inremaining = 2; + + char *outpos = out; + char *inpos = in; + + in[0] = ((char*)&uc)[0]; + in[1] = ((char*)&uc)[1]; + + if (!my_iconv_is_valid(utf16_desc)) { + utf16_desc = my_iconv_open("utf8", "utf16"); + } + + my_iconv(utf16_desc, &inpos, &inremaining, &outpos, &outremaining); + + out[7-inremaining] = 0; + return out; +} + +/*======================================================================== * Name unicode_to_string * Purpose: Translates unicode number to character. * Args: Unicode number.