# HG changeset patch
# User Ben Schmidt
# Date 1392157143 -39600
# Node ID 7ed55458d9e33a9dee592d5d74419cd5d85123ee
# Parent ef3aff5e62556e9cc0c947a6042f4393b9548520
Use iconv to convert unknown character sets.

diff -r ef3aff5e6255 -r 7ed55458d9e3 ChangeLog
--- a/ChangeLog	Mon Feb 10 14:42:32 2014 +1100
+++ b/ChangeLog	Wed Feb 12 09:19:03 2014 +1100
@@ -1,3 +1,4 @@
+ o Use iconv to convert unknown character sets
  o Handle unfolded header lines better
  o Add a tunable for moderation request lifetime (Timo Boettcher)
  o Ensure mlmmj-send always honours tunables (e.g. relayhost)
diff -r ef3aff5e6255 -r 7ed55458d9e3 configure.ac
--- a/configure.ac	Mon Feb 10 14:42:32 2014 +1100
+++ b/configure.ac	Wed Feb 12 09:19:03 2014 +1100
@@ -14,11 +14,13 @@
 # Checks for libraries.
 AC_CHECK_LIB(socket,socket)
 AC_CHECK_LIB(nsl,gethostbyname)
+AC_CHECK_LIB(iconv,iconv_open)
 
 # Checks for header files.
 AC_HEADER_STDC
 AC_CHECK_HEADERS([arpa/inet.h fcntl.h netinet/in.h stddef.h stdlib.h string.h])
 AC_CHECK_HEADERS([sys/socket.h syslog.h unistd.h time.h])
+AC_CHECK_HEADER([iconv.h])
 
 # Checks for typedefs, structures, and compiler characteristics.
 AC_C_CONST
diff -r ef3aff5e6255 -r 7ed55458d9e3 src/unistr.c
--- a/src/unistr.c	Mon Feb 10 14:42:32 2014 +1100
+++ b/src/unistr.c	Wed Feb 12 09:19:03 2014 +1100
@@ -30,12 +30,16 @@
 #include
 #include
 #include
+#include <iconv.h>
 
 #include "mlmmj.h"
 #include "unistr.h"
 #include "log_error.h"
 #include "memory.h"
 
+/* This is allocated on the stack, so it can't be too big. */
+#define ICONV_BUFFER_SIZE 160
+
 
 unistr *unistr_new(void)
 {
@@ -180,6 +184,66 @@
 }
 
 
+/*
+ * Append the bin_len bytes at binary, encoded in charset, to str after
+ * converting them to UTF-8 with iconv.
+ *
+ * If iconv_open fails (for example because charset is unknown), "???" is
+ * appended instead.  An illegal input sequence is replaced by "?" and
+ * conversion resumes one byte later; an incomplete sequence at the end of
+ * the input is replaced by "?"; any other error appends "???" and stops.
+ */
+void unistr_append_iconv(unistr *str, char *binary, size_t bin_len,
+		const char * charset)
+{
+	char bytes[ICONV_BUFFER_SIZE];
+	char * buffer;
+	size_t bufferleft;
+	size_t converted;
+	int err;
+	iconv_t cd;
+
+	cd = iconv_open("UTF-8", charset);
+	if (cd == (iconv_t)-1) {
+		unistr_append_usascii(str, "???", 3);
+		return;
+	}
+
+	while (bin_len > 0) {
+		buffer = bytes;
+		bufferleft = ICONV_BUFFER_SIZE;
+		converted = iconv(cd, &binary, &bin_len, &buffer, &bufferleft);
+		/* save errno before calling anything that might change it */
+		err = errno;
+
+		/* iconv converts as much as it can before failing, so keep
+		 * whatever reached the buffer whether or not it failed */
+		unistr_append_utf8(str, bytes, ICONV_BUFFER_SIZE - bufferleft);
+
+		if (converted != (size_t)-1 || err == E2BIG) {
+			/* success or buffer full */
+			continue;
+		}
+		if (err == EILSEQ) {
+			/* illegal sequence; try to recover */
+			unistr_append_usascii(str, "?", 1);
+			bin_len--;
+			binary++;
+			continue;
+		}
+		if (err == EINVAL) {
+			/* incomplete sequence; we're done */
+			unistr_append_usascii(str, "?", 1);
+		} else {
+			/* some other error; abort */
+			unistr_append_usascii(str, "???", 3);
+		}
+		break;
+	}
+	iconv_close(cd);
+}
+
+
 void unistr_dump(const unistr *str)
 {
 	unsigned int i;
@@ -421,8 +485,7 @@
 	} else if (strcasecmp(charset, "iso-8859-1") == 0) {
 		unistr_append_iso88591(ret, binary, bin_len);
 	} else {
-		/* unknown charset */
-		unistr_append_usascii(ret, "???", 3);
+		unistr_append_iconv(ret, binary, bin_len, charset);
 	}
 
 	myfree(my_word);