Mercurial > hg > mlmmj
changeset 563:9f6c46b77910
initial revision
author | mortenp |
---|---|
date | Mon, 04 Sep 2006 07:21:38 +1000 |
parents | 0e6215f03447 |
children | 8de4095ec6ea |
files | include/unistr.h listtexts/digest src/unistr.c |
diffstat | 3 files changed, 513 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
/* File: include/unistr.h */
/* Copyright (C) 2005 Morten K. Poulsen <morten at afdelingp.dk>
 *
 * $Id$
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef UNISTR_H
#define UNISTR_H

#include <stddef.h>	/* size_t; keeps this header self-contained */

/* One character of a unistr, stored as an unsigned integer value. */
typedef unsigned int unistr_char;

/* Growable array of characters. */
typedef struct _unistr {
	size_t len;		/* number of entries of chars[] in use */
	size_t alloc_len;	/* number of entries allocated for chars[] */
	unistr_char *chars;	/* character storage; no terminator is kept */
} unistr;

/* Allocate a new, empty string. Release it with unistr_free(). */
unistr *unistr_new(void);

/* Release a string and its character storage. A NULL str is ignored. */
void unistr_free(unistr *str);

/* Compare two strings character by character; 0 means they are equal. */
int unistr_cmp(unistr *str1, unistr *str2);

/* Return a newly allocated copy of str. */
unistr *unistr_dup(unistr *str);

/* Append the single character uc, growing the storage when it is full. */
void unistr_append_char(unistr *str, unistr_char uc);

/* Append bin_len bytes of US-ASCII; bytes above 0x7F are appended as '?'. */
void unistr_append_usascii(unistr *str, char *binary, size_t bin_len);

/* Append bin_len bytes of UTF-8; undecodable input is appended as '?'. */
void unistr_append_utf8(unistr *str, char *binary, size_t bin_len);

/* Append bin_len bytes of ISO-8859-1; NUL bytes are appended as '?'. */
void unistr_append_iso88591(unistr *str, char *binary, size_t bin_len);

/* Print a description of str on standard output (debugging aid). */
void unistr_dump(unistr *str);

/* Return a newly allocated, NUL-terminated UTF-8 encoding of str. */
char *unistr_to_utf8(unistr *str);

/* Decode the encoded words ("=?charset?encoding?text?=") found in a header
 * value and return the result as a newly allocated UTF-8 string.
 */
char *unistr_header_to_utf8(char *str);

#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/listtexts/digest Mon Sep 04 07:21:38 2006 +1000 @@ -0,0 +1,5 @@ +Subject: Digest of $listaddr$ issue $digestissue$ ($digestinterval$) + +Topics (messages $digestfirst$ through $digestlast$): + +$digestthreads$
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/unistr.c Mon Sep 04 07:21:38 2006 +1000 @@ -0,0 +1,461 @@ +/* Copyright (C) 2005 Morten K. Poulsen <morten at afdelingp.dk> + * + * $Id$ + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include <unistd.h> +#include <string.h> +#include <stdio.h> +#include <fcntl.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/wait.h> +#include <ctype.h> + +#include "mlmmj.h" +#include "unistr.h" +#include "log_error.h" +#include "memory.h" + + +unistr *unistr_new(void) +{ + unistr *ret; + + ret = mymalloc(sizeof(unistr)); + ret->len = 0; + ret->alloc_len = 64; + ret->chars = mymalloc(ret->alloc_len * sizeof(unistr_char)); + + return ret; +} + + +void unistr_free(unistr *str) +{ + if (!str) + return; + myfree(str->chars); + myfree(str); +} + + +int unistr_cmp(unistr *str1, unistr *str2) +{ + unsigned int i; + + for (i=0; i<str1->len; i++) { + if (str1->chars[i] < str2->chars[i]) { + return -1; + } else if (str1->chars[i] > str2->chars[i]) { + return 1; + } + } + if (str2->len > str1->len) { + return 1; + } + return 0; +} + + +unistr *unistr_dup(unistr *str) +{ + unistr *ret; + unsigned int i; + + ret = unistr_new(); + for (i=0; i<str->len; i++) { + unistr_append_char(ret, str->chars[i]); + } + + return ret; +} + + +void unistr_append_char(unistr *str, unistr_char uc) +{ + if (str->len >= str->alloc_len) { + str->alloc_len *= 2; + str->chars = myrealloc(str->chars, str->alloc_len * sizeof(unistr_char)); + } + str->chars[str->len++] = uc; +} + + +void unistr_append_usascii(unistr *str, char *binary, size_t bin_len) +{ + unsigned int i; + + for (i=0; i<bin_len; i++) { + if ((unsigned char)binary[i] > 0x7F) { + unistr_append_char(str, '?'); + } else { + unistr_append_char(str, (unsigned char)binary[i]); + } + } +} + + +void unistr_append_utf8(unistr *str, char *binary, size_t bin_len) +{ + unsigned int i, j; + unistr_char ch; + unsigned char *bin = (unsigned char *)binary; + + for (i=0; i<bin_len; i++) { + if (bin[i] <= 0x7F) { /* 1 */ + unistr_append_char(str, bin[i]); + } else { + if ((bin[i] & 224) == 192) { /* 2 */ + ch = bin[i] & 31; + j = 1; + } else if ((bin[i] & 240) == 224) { /* 3 */ + ch = bin[i] & 15; + j = 2; + 
} else if ((bin[i] & 248) == 240) { /* 4 */ + ch = bin[i] & 7; + j = 3; + } else if ((bin[i] & 252) == 248) { /* 5 */ + ch = bin[i] & 3; + j = 4; + } else if ((bin[i] & 254) == 252) { /* 6 */ + ch = bin[i] & 1; + j = 5; + } else { + /* invalid byte sequence */ + unistr_append_char(str, '?'); + continue; + } + if (ch == 0) { + /* invalid encoding, no data bits set in first byte */ + unistr_append_char(str, '?'); + continue; + } + for (;j>0; j--) { + i++; + ch <<= 6; + if ((bin[i] & 192) != 128) { + /* invalid byte sequence */ + ch = '?'; + break; + } + ch |= bin[i] & 63; + } + unistr_append_char(str, ch); + } + } +} + + +void unistr_append_iso88591(unistr *str, char *binary, size_t bin_len) +{ + unsigned int i; + + for (i=0; i<bin_len; i++) { + if (binary[i] == 0x00) { + unistr_append_char(str, '?'); + } else { + unistr_append_char(str, (unsigned char)binary[i]); + } + } +} + + +void unistr_dump(unistr *str) +{ + unsigned int i; + + printf("unistr_dump(%p)\n", (void *)str); + printf(" ->len = %d\n", str->len); + printf(" ->alloc_len = %d\n", str->alloc_len); + printf(" ->chars [ "); + for (i=0; i<str->len; i++) { + if ((str->chars[i] <= 0x7F) && (str->chars[i] != '\n')) { + printf("'%c' ", str->chars[i]); + } else { + printf("0x%02X ", str->chars[i]); + } + } + printf("]\n"); +} + + +char *unistr_to_utf8(unistr *str) +{ + unsigned int i; + size_t len = 0; + char *ret; + char *p; + + for (i=0; i<str->len; i++) { + if (str->chars[i] <= 0x7F) { + len++; + } else if (str->chars[i] <= 0x7FF) { + len += 2; + } else if (str->chars[i] <= 0xFFFF) { + len += 3; + } else if (str->chars[i] <= 0x1FFFFF) { + len += 4; + } else if (str->chars[i] <= 0x3FFFFFF) { + len += 5; + } else if (str->chars[i] <= 0x7FFFFFFF) { + len += 6; + } else { + errno = 0; + log_error(LOG_ARGS, "unistr_to_utf8(): can not utf-8 encode" + "U+%04X", str->chars[i]); + return mystrdup(""); + } + } + len++; /* NUL */ + + ret = mymalloc(len); + p = ret; + + for (i=0; i<str->len; i++) { + if (str->chars[i] <= 
0x7F) { /* 1 */ + *(p++) = str->chars[i]; + } else if (str->chars[i] <= 0x7FF) { /* 2 */ + *(p++) = 192 + ((str->chars[i] & 1984) >> 6); + *(p++) = 128 + (str->chars[i] & 63); + } else if (str->chars[i] <= 0xFFFF) { /* 3 */ + *(p++) = 224 + ((str->chars[i] & 61440) >> 12); + *(p++) = 128 + ((str->chars[i] & 4032) >> 6); + *(p++) = 128 + (str->chars[i] & 63); + } else if (str->chars[i] <= 0x1FFFFF) { /* 4 */ + *(p++) = 240 + ((str->chars[i] & 1835008) >> 18); + *(p++) = 128 + ((str->chars[i] & 258048) >> 12); + *(p++) = 128 + ((str->chars[i] & 4032) >> 6); + *(p++) = 128 + (str->chars[i] & 63); + } else if (str->chars[i] <= 0x3FFFFFF) { /* 5 */ + *(p++) = 248 + ((str->chars[i] & 50331648) >> 24); + *(p++) = 128 + ((str->chars[i] & 16515072) >> 18); + *(p++) = 128 + ((str->chars[i] & 258048) >> 12); + *(p++) = 128 + ((str->chars[i] & 4032) >> 6); + *(p++) = 128 + (str->chars[i] & 63); + } else if (str->chars[i] <= 0x7FFFFFFF) { /* 6 */ + *(p++) = 252 + ((str->chars[i] & 1073741824) >> 30); + *(p++) = 128 + ((str->chars[i] & 1056964608) >> 24); + *(p++) = 128 + ((str->chars[i] & 16515072) >> 18); + *(p++) = 128 + ((str->chars[i] & 258048) >> 12); + *(p++) = 128 + ((str->chars[i] & 4032) >> 6); + *(p++) = 128 + (str->chars[i] & 63); + } else { + errno = 0; + log_error(LOG_ARGS, "unistr_to_utf8(): can not utf-8 encode" + "U+%04X", str->chars[i]); + } + } + *(p++) = '\0'; + + return ret; +} + + +static int hexval(char ch) +{ + ch = tolower(ch); + + if ((ch >= 'a') && (ch <= 'f')) { + return 10 + ch - 'a'; + } + + if ((ch >= '0') && (ch <= '9')) { + return ch - '0'; + } + + return 0; +} + + +static void decode_qp(char *str, char **binary, size_t *bin_len) +{ + int i; + + /* decoded string will never be longer, and we don't include a NUL */ + *binary = mymalloc(strlen(str)); + *bin_len = 0; + + for (i=0; str[i]; i++) { + if ((str[i] == '=') && isxdigit(str[i+1]) && isxdigit(str[i+2])) { + (*binary)[(*bin_len)++] = (hexval(str[i+1]) << 4) + hexval(str[i+2]); + i += 2; + } 
else if (str[i] == '_') { + (*binary)[(*bin_len)++] = 0x20; + } else { + (*binary)[(*bin_len)++] = str[i]; + } + } +} + + +static void decode_base64(char *str, char **binary, size_t *bin_len) +{ + int tab[] = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1, + -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1, + -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + size_t len; + unsigned int i; + unsigned int out; + int out_numbits; + int val; + + /* decoded string will never be longer, and we don't include a NUL */ + len = strlen(str); + *binary = mymalloc(len); + *bin_len = 0; + + out = 0; + out_numbits = 0; + for (i=0; i<strlen(str); i++) { + val = tab[(unsigned char)str[i]]; + if (val == -1) + continue; + out <<= 6; + out |= val; + out_numbits += 6; + if (out_numbits >= 8) { + (*binary)[(*bin_len)++] = (out >> (out_numbits - 8)) & 255; + out_numbits -= 8; + } + } +} + + +static void header_decode_word(char *word, unistr *ret) +{ + char *my_word; + char *charset, *encoding, *string, *end; + char *binary; + size_t bin_len; + + + if ((word[0] != '=') || (word[1] != '?')) { + 
unistr_append_usascii(ret, word, strlen(word)); + return; + } + + my_word = mystrdup(word); + + charset = my_word + 2; + + if ((encoding = strchr(charset, '?')) == NULL) { + /* missing encoding */ + unistr_append_usascii(ret, "???", 3); + myfree(my_word); + return; + } + *(encoding++) = '\0'; + + if ((string = strchr(encoding, '?')) == NULL) { + /* missing string */ + unistr_append_usascii(ret, "???", 3); + myfree(my_word); + return; + } + *(string++) = '\0'; + + if ((end = strchr(string, '?')) == NULL) { + /* missing end */ + unistr_append_usascii(ret, "???", 3); + myfree(my_word); + return; + } + *(end++) = '\0'; + if ((end[0] != '=') || (end[1] != '\0')) { + /* broken end */ + unistr_append_usascii(ret, "???", 3); + myfree(my_word); + return; + } + + if (tolower(encoding[0]) == 'q') { + decode_qp(string, &binary, &bin_len); + } else if (tolower(encoding[0]) == 'b') { + decode_base64(string, &binary, &bin_len); + } else { + /* unknown encoding */ + unistr_append_usascii(ret, "???", 3); + myfree(my_word); + return; + } + + if (strcasecmp(charset, "us-ascii") == 0) { + unistr_append_usascii(ret, binary, bin_len); + } else if (strcasecmp(charset, "utf-8") == 0) { + unistr_append_utf8(ret, binary, bin_len); + } else if (strcasecmp(charset, "iso-8859-1") == 0) { + unistr_append_iso88591(ret, binary, bin_len); + } else { + /* unknown charset */ + unistr_append_usascii(ret, "???", 3); + } + + myfree(my_word); + myfree(binary); +} + + +/* IN: "=?iso-8859-1?Q?hyggem=F8de?= torsdag" + * OUT: "hyggem\xC3\xB8de torsdag" + */ +char *unistr_header_to_utf8(char *str) +{ + char *my_str; + char *word; + char *p; + unistr *us; + char *ret; + + my_str = mystrdup(str); + us = unistr_new(); + + word = strtok_r(my_str, " \t\n", &p); + while (word) { + header_decode_word(word, us); + word = strtok_r(NULL, " \t\n", &p); + if (word) + unistr_append_char(us, ' '); + } + + myfree(my_str); + + ret = unistr_to_utf8(us); + unistr_free(us); + + return ret; +}