[Top][All Lists]
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Groff] Re: man page encoding
From: |
Werner LEMBERG |
Subject: |
Re: [Groff] Re: man page encoding |
Date: |
Thu, 07 Jul 2005 12:24:34 +0200 (CEST) |
> BTW, attached is the file gpreconv [...]
Since it is possible that some mailers don't accept attachments, I resend
it here, directly embedded in the email.
Werner
======================================================================
#define I18N
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
#ifdef I18N
#include <iconv.h>
#include <langinfo.h>
#include <locale.h>
#endif /* I18N */
#define MAX_VAR_LEN 100
char *check_encoding_tag(char *);
char *check_encoding_tag_parse_tag(char *, char **, char **);
char *read_file (FILE *);
char *emacs2mime(char *);
char *mime2locale(char *);
void conversion_latin1 (char *, char *);
void conversion_utf8 (char *, char *);
void conversion_cp1047 (char *, char *);
void conversion_iconv (char *, char *);
void help (char *);
/* One row of an encoding-name translation table: `from` is the name
 * that is looked up, `to` is the name it is translated to. */
typedef struct {
char * from;
char * to;
} encname_t;
/* Translation table from Emacs coding-system names to MIME charset
 * names.  emacs2mime() scans it linearly, comparing `from`
 * case-insensitively, and returns `to` of the first match.
 * Many rows have an empty `to`; presumably these are coding systems
 * without a MIME equivalent -- confirm before relying on that meaning.
 * The {NULL, NULL} row terminates the table. */
encname_t emacs_to_mime[] = {
{"alternativnyj", ""},
{"big5", "Big5"},
{"chinese-big5", "Big5"},
{"chinese-euc", "EUC-CN"},
{"chinese-hz", "HZ-GB-2312"},
{"chinese-iso-7bit", ""},
{"chinese-iso-8bit", ""},
{"chinese-iso-8bit-with-esc", ""},
{"cn-big5", "Big5"},
{"cn-gb-2312", "GB2312"},
{"compound-text", ""},
{"ctext", ""},
{"cyrillic-alternativnyj", ""},
{"cyrillic-iso-8bit", "ISO-8859-5"},
{"cyrillic-iso-8bit-with-esc", ""},
{"cyrillic-koi8", "KOI8-R"},
{"dos", ""},
{"emacs-mule", ""},
{"euc-china", "EUC-CN"},
{"euc-cn", "EUC-CN"},
{"euc-japan", "EUC-JP"},
{"euc-jisx0213", "EUC-JP"},
{"euc-jisx0213-with-esc", "EUC-JP"},
{"euc-jp", "EUC-JP"},
{"euc-korea", "EUC-KR"},
{"euc-kr", "EUC-KR"},
{"gb2312", "GB2312"},
{"greek-iso-8bit", "ISO-8859-7"},
{"greek-iso-8bit-with-esc", "ISO-8859-7"},
{"hebrew-iso-8bit", "ISO-8859-8"},
{"hebrew-iso-8bit-with-esc", "ISO-8859-8"},
{"hz", "HZ-GB-2312"},
{"hz-gb-2312", "HZ-GB-2312"},
{"in-is13194-devanagari", ""},
{"in-is13194-devanagari-with-esc", ""},
{"iso-2022-7", ""},
{"iso-2022-7bit", ""},
{"iso-2022-7bit-lock", ""},
{"iso-2022-7bit-lock-ss2", ""},
{"iso-2022-7bit-ss2", ""},
{"iso-2022-8", ""},
{"iso-2022-8bit", ""},
{"iso-2022-8bit-lock" , ""},
{"iso-2022-8bit-lock-ss2", ""},
{"iso-2022-8bit-ss2", ""},
{"iso-2022-cjk", ""},
{"iso-2022-cn", "ISO-2022-CN"},
{"iso-2022-cn-ext", "ISO-2022-CN-EXT"},
{"iso-2022-int-1", ""},
{"iso-2022-jp", "ISO-2022-JP"},
{"iso-2022-jp-1978-irv", "ISO-2022-JP"},
{"iso-2022-jp-2", "ISO-2022-JP-2"},
{"iso-2022-jp-3", ""},
{"iso-2022-jp-3-compatible", ""},
{"iso-2022-jp-3-strict", ""},
{"iso-2022-kr", "ISO-2022-KR"},
{"iso-2022-lock", ""},
{"iso-8859-1", "ISO-8859-1"},
{"iso-8859-2", "ISO-8859-2"},
{"iso-8859-3", "ISO-8859-3"},
{"iso-8859-4", "ISO-8859-4"},
{"iso-8859-5", "ISO-8859-5"},
{"iso-8859-6", "ISO-8859-6"},
{"iso-8859-7", "ISO-8859-7"},
{"iso-8859-8", "ISO-8859-8"},
{"iso-8859-9", "ISO-8859-9"},
{"iso-latin-1", "ISO-8859-1"},
{"iso-latin-2", "ISO-8859-2"},
{"iso-latin-3", "ISO-8859-3"},
{"iso-latin-4", "ISO-8859-4"},
{"iso-latin-5", "ISO-8859-9"},
{"iso-safe", ""},
{"japanese-iso-7bit-1978-irv", "ISO-2022-JP"},
{"japanese-iso-8bit", ""},
{"japanese-iso-8bit-with-esc", ""},
{"japanese-euc", "EUC-JP"},
{"japanese-shift-jis", "Shift_JIS"},
{"japanese-shift-jisx0213", ""},
{"junet", "ISO-2022-JP"},
{"koi8", "KOI8-R"},
{"koi8-r", "KOI8-R"},
{"korean-euc", "EUC-KR"},
{"korean-iso-7bit-lock", "ISO-2022-KR"},
{"korean-iso-8bit", ""},
{"korean-iso-8bit-with-esc", ""},
{"lao", ""},
{"lao-with-esc", ""},
{"latin-1", "ISO-8859-1"},
{"latin-2", "ISO-8859-2"},
{"latin-3", "ISO-8859-3"},
{"latin-4", "ISO-8859-4"},
{"latin-5", "ISO-8859-9"},
{"mac", ""},
{"old-jis", "ISO-2022-JP"},
{"raw-text", ""},
{"shift_jis", "Shift_JIS"},
{"shift_jisx0213", "Shift_JIS"},
{"sjis", "Shift_JIS"},
{"th-tis620", "TIS-620"},
{"thai-tis620", "TIS-620"},
{"tibetan", ""},
{"tis-620", "TIS-620"},
{"tis620", "TIS-620"},
{"us-ascii", "US-ASCII"},
{"utf-16-be", "UTF-16BE"},
{"utf-16-be-no-signature", "UTF-16BE"},
{"utf-16-le", "UTF-16LE"},
{"utf-16-le-no-signature", "UTF-16LE"},
{"utf-7", "UTF-7"},
{"utf-7-safe", "UTF-7"},
{"utf-8", "UTF-8"},
{"utf-8-ws", "UTF-8"},
{"vietnamese-viqr", "VIQR"},
{"vietnamese-viscii", "VISCII"},
{"vietnamese-vscii", "VISCII"},
{"viqr", "VIQR"},
{"viscii", "VISCII"},
/* NOTE(review): "vietnamese-vscii" above maps to "VISCII" while "vscii"
 * maps to "VSCII"; the two rows look inconsistent -- confirm which
 * target name is intended. */
{"vscii", "VSCII"},
{"x-ctext", ""},
{NULL, NULL}
};
/* Translation table from MIME charset names to locale encoding names,
 * scanned by mime2locale().  It currently holds only the {NULL, NULL}
 * terminator, so mime2locale() returns its argument unchanged. */
encname_t mime_to_locale[] = {
{NULL, NULL}
};
/* ---------------------------------------------------------
 * main: convert the input (file argument or stdin) to UTF-8
 * on stdout.
 *
 * The source encoding is chosen in this order:
 *   1. the argument of the -e option,
 *   2. a "coding" tag found by check_encoding_tag(),
 *   3. a default: "latin1" for the C/POSIX locale or when
 *      I18N is not compiled in, otherwise nl_langinfo(CODESET).
 *
 * Returns 0 on success and exits with status 1 on error.
 * Diagnostics are written to stderr because stdout carries
 * the converted document.
 * ---------------------------------------------------------
 */
int main(int argc, char **argv)
{
  char *encoding = NULL, *default_encoding, *inbuf;
  FILE *fp;

  /* Determine the default encoding.  This part has to be located
   * before getopt() since the help message shows the default
   * encoding.
   */
#ifdef I18N
  {
    char *locale;

    setlocale(LC_ALL, "");
    locale = setlocale(LC_CTYPE, NULL);
    if (!locale || !strcmp(locale, "C") || !strcmp(locale, "POSIX")) {
      default_encoding = "latin1";
    } else {
      default_encoding = nl_langinfo(CODESET);
      /* nl_langinfo() may yield an empty string; treat that like
       * "no information". */
      if (!default_encoding || !*default_encoding)
        default_encoding = "latin1";
    }
  }
#else
  default_encoding = "latin1";
#endif

  /* parse the command options */
  for (;;) {
    int opt = getopt(argc, argv, "e:h");

    if (opt == -1)
      break;
    switch (opt) {
    case 'e':
      /* optarg points into argv, which outlives main's use of it,
       * so no copy is needed. */
      encoding = optarg;
      break;
    case 'h':
      help(default_encoding);
      exit(0);
    default:
      exit(1);
    }
  }

  /* read the source as a whole */
  if (optind < argc) {
    fp = fopen(argv[optind], "r");
    if (!fp) {
      fprintf(stderr, "Cannot open %s\n", argv[optind]);
      exit(1);
    }
    inbuf = read_file(fp);
    fclose(fp);
  } else {
    inbuf = read_file(stdin);
  }

  /* finally determine the encoding */
  if (encoding == NULL) {
    encoding = check_encoding_tag(inbuf);
    if (encoding == NULL)
      encoding = default_encoding;
  }

  /* translate from MIME & Emacs encoding names to locale encoding names */
  encoding = emacs2mime(encoding);
  encoding = mime2locale(encoding);
  /* NOTE(review): emacs2mime() returns "" for table rows without a
   * target name; that empty string is passed on unchanged here.
   * Confirm whether it should be rejected instead. */

  /* call converter (converters write to stdout) */
  if (!strcasecmp(encoding, "latin1")) {
    conversion_latin1(inbuf, encoding);
  } else if (!strcasecmp(encoding, "utf8")) {
    conversion_utf8(inbuf, encoding);
  } else if (!strcasecmp(encoding, "cp1047")) {
    conversion_cp1047(inbuf, encoding);
  } else {
#ifdef I18N
    conversion_iconv(inbuf, encoding);
#else
    fprintf(stderr, "Conversion from %s to UTF-8 is not supported.\n",
            encoding);
    exit(1);
#endif
  }

  /* make sure the converted text really reached stdout */
  if (fflush(stdout) == EOF) {
    fprintf(stderr, "Write error.\n");
    exit(1);
  }
  free(inbuf);
  return 0;
}
/* ---------------------------------------------------------
 * help: write the usage summary to stdout.
 * The first line names the build flavour (selected at compile
 * time by I18N); the last line shows default_encoding.
 * ---------------------------------------------------------
 */
void help(char *default_encoding)
{
#ifdef I18N
  const char *build_flavour = "internationalized version";
#else
  const char *build_flavour = "non-internationalized version";
#endif

  printf("Preprocessor for Groff system (%s)\n"
         "Usage: gpreconv [option] [input file]\n"
         " -e encoding specify encoding\n"
         " -h this message\n"
         "The default encoding is \"%s\".\n",
         build_flavour,
         default_encoding);
}
/* ---------------------------------------------------------
 * read_file: read everything from fp into one heap buffer.
 * The input has to be read as a whole before conversion
 * since the encoding may be stateful like the ISO-2022 series.
 *
 * Returns a NUL-terminated buffer obtained from realloc();
 * the caller owns it and may free() it.  Because the result
 * is handled as a C string, input bytes after an embedded
 * NUL are invisible to callers that use strlen().
 * On allocation failure or read error a message is written
 * to stderr (stdout is reserved for converted output) and
 * the process exits with status 1.
 * ---------------------------------------------------------
 */
char *read_file (FILE *fp)
{
#define READBUF_SIZE 32768
  char *buf = NULL;
  size_t capacity = 0;  /* usable bytes in buf, excluding the NUL slot */
  size_t used = 0;      /* bytes read so far */

  for (;;) {
    if (used == capacity) {
      char *grown;

      /* refuse to wrap around size_t when growing */
      if (capacity > (size_t)-1 - READBUF_SIZE - 1) {
        free(buf);
        fprintf(stderr, "Unable to allocate memory.\n");
        exit(1);
      }
      capacity += READBUF_SIZE;
      /* one extra byte for the terminating NUL; keep the old
       * pointer until realloc() is known to have succeeded */
      grown = realloc(buf, capacity + 1);
      if (!grown) {
        free(buf);
        fprintf(stderr, "Unable to allocate memory.\n");
        exit(1);
      }
      buf = grown;
    }
    used += fread(buf + used, 1, capacity - used, fp);
    if (ferror(fp)) {
      free(buf);
      fprintf(stderr, "Read error.\n");
      exit(1);
    }
    if (feof(fp))
      break;
  }
  buf[used] = '\0';
  return buf;
}
/* ---------------------------------------------------------
 * check_encoding_tag: look for a "coding" variable in a
 * "-*- var: value; ... -*-" tag.
 *
 * Only the leading run of lines that start with the three
 * characters .\" is examined; scanning stops at the first
 * line that does not start that way.  Each tag is split into
 * variable/value pairs by check_encoding_tag_parse_tag().
 *
 * inbuf is modified temporarily while scanning and restored
 * before returning.
 *
 * Returns the value of the first non-empty "coding" variable
 * (compared case-insensitively), or NULL if there is none.
 * The returned pointer is the value buffer handed back by
 * check_encoding_tag_parse_tag().
 * ---------------------------------------------------------
 */
char *check_encoding_tag(char *inbuf)
{
  char *line, *next;

  for (line = inbuf; !strncmp(line, ".\\\"", 3); line = next) {
    char *eol, *tag_start, *tag_end = NULL, *cursor;
    char eol_char;

    /* A final line without '\n' still counts as a line: its end is
     * the end of the buffer. */
    eol = strchr(line, '\n');
    if (eol == NULL)
      eol = line + strlen(line);
    next = (*eol == '\n') ? eol + 1 : eol;

    /* Cut the buffer at the line end so that strstr() cannot match
     * a "-*-" on a later line. */
    eol_char = *eol;
    *eol = 0;
    tag_start = strstr(line, "-*-");
    if (tag_start)
      tag_end = strstr(tag_start + 3, "-*-");
    *eol = eol_char;
    if (!tag_start || !tag_end)
      continue;

    /* Cut at the closing "-*-" while the pairs are parsed. */
    *tag_end = 0;
    cursor = tag_start + 3;
    while (*cursor) {
      char *variable, *value;

      cursor = check_encoding_tag_parse_tag(cursor, &variable, &value);
      /* an empty value names no encoding; keep looking */
      if (!strcasecmp(variable, "coding") && *value) {
        *tag_end = '-';
        return value;
      }
    }
    *tag_end = '-';
  }
  return NULL;
}
/* Parse one "variable: value;" pair starting at d1.
 *
 * *variable and *value are set to two static buffers inside this
 * function, so the strings are overwritten by the next call.  Each is
 * truncated to MAX_VAR_LEN-1 characters.  The value is empty when the
 * pair has no ':' part.
 *
 * Returns the position just after the terminating ';', or the position
 * of the terminating NUL when the string ends first. */
char *check_encoding_tag_parse_tag(char *d1, char **variable, char **value)
{
  static char var[MAX_VAR_LEN], val[MAX_VAR_LEN];
  char *cursor = d1;
  int n;

  *variable = var;
  *value = val;
  val[0] = 0;

  /* variable name: skip blanks, then copy up to a delimiter */
  cursor += strspn(cursor, " \t");
  for (n = 0; n < MAX_VAR_LEN-1 && *cursor && !strchr(";: \t", *cursor); n++)
    var[n] = *cursor++;
  var[n] = 0;

  /* advance to the ':' that introduces the value or the ';' that
   * ends the pair */
  cursor += strcspn(cursor, ":;");
  if (*cursor == 0)
    return cursor;
  if (*cursor == ';')
    return cursor + 1;

  /* value: step over ':', skip blanks, then copy up to a delimiter */
  cursor++;
  cursor += strspn(cursor, " \t");
  for (n = 0; n < MAX_VAR_LEN-1 && *cursor && !strchr("; \t", *cursor); n++)
    val[n] = *cursor++;
  val[n] = 0;

  /* consume the rest of the pair including its ';' if present */
  cursor += strcspn(cursor, ";");
  return *cursor ? cursor + 1 : cursor;
}
/* ---------------------------------------------------------
 * emacs2mime: convert an encoding name from Emacs to MIME.
 *
 * The name is copied (truncated to MAX_VAR_LEN-1 characters),
 * one trailing "-dos", "-mac" or "-unix" suffix is removed
 * (case-insensitively), and the result is looked up
 * case-insensitively in emacs_to_mime[].
 *
 * Returns the table's `to` string on a match (this may be an
 * empty string), otherwise the suffix-stripped copy, which
 * lives in a static buffer that the next call overwrites.
 * ---------------------------------------------------------
 */
char *emacs2mime(char *emacs_encoding)
{
  static const char *const eol_suffixes[] = { "-dos", "-mac", "-unix" };
  static char emacs_enc[MAX_VAR_LEN];
  size_t emacs_enc_len, i;
  encname_t *table;

  strncpy(emacs_enc, emacs_encoding, MAX_VAR_LEN-1);
  emacs_enc[MAX_VAR_LEN-1] = 0;
  emacs_enc_len = strlen(emacs_enc);

  for (i = 0; i < sizeof eol_suffixes / sizeof eol_suffixes[0]; i++) {
    size_t suffix_len = strlen(eol_suffixes[i]);

    /* Only look at the tail when the name is long enough; otherwise
     * the comparison would read memory in front of emacs_enc[]. */
    if (emacs_enc_len >= suffix_len
        && !strcasecmp(emacs_enc + emacs_enc_len - suffix_len,
                       eol_suffixes[i])) {
      emacs_enc[emacs_enc_len - suffix_len] = 0;
      break;  /* at most one suffix is removed */
    }
  }

  for (table = emacs_to_mime; table->from; table++) {
    if (!strcasecmp(emacs_enc, table->from))
      return table->to;
  }
  return emacs_enc;
}
/* ---------------------------------------------------------
 * mime2locale: convert an encoding name from MIME to locale.
 * Looks mime_encoding up case-insensitively in
 * mime_to_locale[] and returns the matching `to` string, or
 * mime_encoding itself when no row matches.
 * ---------------------------------------------------------
 */
char *mime2locale(char *mime_encoding)
{
  encname_t *entry = mime_to_locale;

  while (entry->from != NULL) {
    if (strcasecmp(mime_encoding, entry->from) == 0)
      return entry->to;
    entry++;
  }
  return mime_encoding;
}
/* ---------------------------------------------------------
* conversion functions
* ---------------------------------------------------------
*/
/* Convert ISO-8859-1 (aka Latin-1) text to UTF-8 on stdout.
 * inbuf is processed up to its terminating NUL; bytes below 0x80 are
 * copied as they are, all others become a two-byte sequence.
 * The encoding argument is not used. */
void conversion_latin1 (char *inbuf, char *encoding)
{
  size_t i;

  for (i = 0; inbuf[i] != 0; i++) {
    unsigned char byte = (unsigned char)inbuf[i];

    if (byte >= 0x80) {
      putchar(0xc0 | (byte >> 6));    /* lead byte: top two bits */
      putchar(0x80 | (byte & 0x3f));  /* continuation: low six bits */
    } else {
      putchar(byte);
    }
  }
}
/* "Conversion" from UTF-8 to UTF-8: copy inbuf, up to its terminating
 * NUL, to stdout unchanged.  The encoding argument is not used. */
void conversion_utf8 (char *inbuf, char *encoding)
{
  fputs(inbuf, stdout);
}
/* conversion from CP1047 (EBCDIC) to UTF-8 */
/* the table is made from /font/devcp1047/R.proto in groff 1.16 */
/* Each input byte up to the terminating NUL is mapped through the
 * table to a Latin-1 code, which is then written to stdout as UTF-8
 * (one byte below 0x80, two bytes otherwise).  A table entry of 0x00
 * means "no mapping": the input byte is used as it is.
 * NOTE(review): 0x00-0x3f have no entries, so EBCDIC control codes
 * (tab, line end, ...) are passed through untranslated -- confirm
 * whether the input is expected to have them translated already.
 * The encoding argument is not used. */
void conversion_cp1047 (char *inbuf, char *encoding)
{
  static const unsigned char cp1047[] = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x00-0x07 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x08-0x0f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x10-0x17 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x18-0x1f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x20-0x27 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x28-0x2f */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x30-0x37 */
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0x38-0x3f */
    /* 0x40 is the EBCDIC space and 0x41 the no-break space; without
     * entries the fail-safe would emit '@' and 'A' for them */
    ' ' , 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5, /* 0x40-0x47 */
    0xe7, 0xf1, 0xa2, '.' , '<' , '(' , '+' , '|' , /* 0x48-0x4f */
    /* 0x51 is e-acute (0xe9), matching E-acute (0xc9) at 0x71 */
    '&' , 0xe9, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef, /* 0x50-0x57 */
    0xec, 0xdf, '!' , '$' , '*' , ')' , ';' , '^' , /* 0x58-0x5f */
    '-' , '/' , 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5, /* 0x60-0x67 */
    0xc7, 0xd1, 0xa6, ',' , '%' , '_' , '>' , '?' , /* 0x68-0x6f */
    0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf, /* 0x70-0x77 */
    0xcc, '`' , ':' , '#' , '@' , '\'', '=' , '\"', /* 0x78-0x7f */
    0xd8, 'a' , 'b' , 'c' , 'd' , 'e' , 'f' , 'g' , /* 0x80-0x87 */
    'h' , 'i' , 0xab, 0xbb, 0xf0, 0xfd, 0xfe, 0xb1, /* 0x88-0x8f */
    0xb0, 'j' , 'k' , 'l' , 'm' , 'n' , 'o' , 'p' , /* 0x90-0x97 */
    'q' , 'r' , 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4, /* 0x98-0x9f */
    0xb5, '~' , 's' , 't' , 'u' , 'v' , 'w' , 'x' , /* 0xa0-0xa7 */
    'y' , 'z' , 0xa1, 0xbf, 0xd0, '[' , 0xde, 0xae, /* 0xa8-0xaf */
    0xac, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc, /* 0xb0-0xb7 */
    0xbd, 0xbe, 0xdd, 0xa8, 0xaf, ']' , 0xb4, 0xd7, /* 0xb8-0xbf */
    '{' , 'A' , 'B' , 'C' , 'D' , 'E' , 'F' , 'G' , /* 0xc0-0xc7 */
    'H' , 'I' , 0xad, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5, /* 0xc8-0xcf */
    '}' , 'J' , 'K' , 'L' , 'M' , 'N' , 'O' , 'P' , /* 0xd0-0xd7 */
    'Q' , 'R' , 0xb9, 0xfb, 0xfc, 0xf9, 0xfa, 0xff, /* 0xd8-0xdf */
    '\\', 0xf7, 'S' , 'T' , 'U' , 'V' , 'W' , 'X' , /* 0xe0-0xe7 */
    'Y' , 'Z' , 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5, /* 0xe8-0xef */
    '0' , '1' , '2' , '3' , '4' , '5' , '6' , '7' , /* 0xf0-0xf7 */
    '8' , '9' , 0xb3, 0xdb, 0xdc, 0xd9, 0xda, 0x00  /* 0xf8-0xff */
  };
  const unsigned char *p;

  for (p = (const unsigned char *)inbuf; *p; p++) {
    unsigned char c = cp1047[*p];

    if (c == 0)
      c = *p; /* fail safe */
    if (c < 0x80) {
      putchar(c);
    } else {
      putchar(0xc0 + (c >> 6));
      putchar(0x80 + (c & 0x3f));
    }
  }
}
/* locale-sensible conversion */
#ifdef I18N
/* Convert inbuf (up to its terminating NUL) from `encoding` to UTF-8
 * with iconv and write the result to stdout.
 *
 * The whole result is collected in a buffer that grows in steps of
 * OUTBUF_SIZE and is written in one piece at the end.  After the
 * input is consumed, iconv() is called once more with a NULL input
 * so that a stateful converter can emit its closing sequence.
 *
 * Any failure writes a message to stderr (stdout is reserved for
 * converted output) and exits with status 1. */
void conversion_iconv (char *inbuf, char *encoding)
{
#define OUTBUF_SIZE 32768
#define OUTBUF_LIMIT 10
  char *out_base = NULL;   /* start of the output buffer */
  size_t out_cap = 0;      /* bytes allocated at out_base */
  size_t out_used = 0;     /* bytes of converted text so far */
  size_t inbytesleft;
  int need_room = 0;       /* iconv() reported E2BIG */
  int flushing = 0;        /* input consumed, resetting shift state */
  iconv_t handle;

  handle = iconv_open("UTF-8", encoding);
  if (handle == (iconv_t)-1) {
    if (errno == EINVAL) {
      fprintf(stderr, "Conversion from %s to UTF-8 is not supported.\n",
              encoding);
      exit(1);
    }
    fprintf(stderr, "iconv_open failed!\n");
    exit(1);
  }

  inbytesleft = strlen(inbuf);
  for (;;) {
    char *outp;
    size_t outleft, status;
    int saved_errno;

    if (need_room || out_cap - out_used < OUTBUF_LIMIT) {
      char *grown = realloc(out_base, out_cap + OUTBUF_SIZE);

      if (!grown) {
        fprintf(stderr, "Unable to allocate memory.\n");
        exit(1);
      }
      out_base = grown;
      out_cap += OUTBUF_SIZE;
      need_room = 0;
    }
    outp = out_base + out_used;
    outleft = out_cap - out_used;

    if (flushing)
      status = iconv(handle, NULL, NULL, &outp, &outleft);
    else
      status = iconv(handle, &inbuf, &inbytesleft, &outp, &outleft);
    saved_errno = errno;
    out_used = (size_t)(outp - out_base);

    if (status == (size_t)-1) {
      if (saved_errno == E2BIG) {
        /* output buffer full: enlarge it and carry on */
        need_room = 1;
        continue;
      }
      if (saved_errno == EINVAL || saved_errno == EILSEQ) {
        fprintf(stderr, "Invalid character.\n");
        exit(1);
      }
      /* anything else would repeat forever if retried */
      fprintf(stderr, "iconv failed!\n");
      exit(1);
    }

    if (flushing)
      break;
    if (inbytesleft == 0)
      flushing = 1;
  }

  if (fwrite(out_base, 1, out_used, stdout) != out_used) {
    fprintf(stderr, "Write error.\n");
    exit(1);
  }
  free(out_base);
  iconv_close(handle);
}
#endif
- [Groff] Re: man page encoding, (continued)
- [Groff] Re: man page encoding, Andries Brouwer, 2005/07/06
- [Groff] Re: man page encoding, Bruno Haible, 2005/07/07
- [Groff] Re: man page encoding, Andries Brouwer, 2005/07/07
- Re: [Groff] Re: man page encoding, Werner LEMBERG, 2005/07/07
- [Groff] Re: man page encoding, Bruno Haible, 2005/07/07
- Re: [Groff] Re: man page encoding, Zvezdan Petkovic, 2005/07/07
- Re: [Groff] Re: man page encoding, Andries Brouwer, 2005/07/07
- Re: [Groff] Re: man page encoding, Werner LEMBERG, 2005/07/07
- Re: [Groff] Re: man page encoding, Clarke Echols, 2005/07/07
- Re: [Groff] Re: man page encoding, Werner LEMBERG, 2005/07/07
- Re: [Groff] Re: man page encoding,
Werner LEMBERG <=
- Re: [Groff] Re: man page encoding, Bruno Haible, 2005/07/07
- Re: [Groff] Re: man page encoding, Andries Brouwer, 2005/07/07
- Re: [Groff] Re: man page encoding, Bruno Haible, 2005/07/07
- Re: [Groff] Re: man page encoding, Andries Brouwer, 2005/07/08
- Re: [Groff] Re: man page encoding, Bruno Haible, 2005/07/08
- Re: [Groff] Re: man page encoding, Andries Brouwer, 2005/07/08
- Re: [Groff] Re: man page encoding, Werner LEMBERG, 2005/07/08
- Re: [Groff] Re: man page encoding, Bruno Haible, 2005/07/08
- Re: [Groff] Re: man page encoding, Werner LEMBERG, 2005/07/08
- Re: [Groff] Re: man page encoding, Zvezdan Petkovic, 2005/07/07