-
Notifications
You must be signed in to change notification settings - Fork 82
Expand file tree
/
Copy pathreadstat_convert.c
More file actions
143 lines (120 loc) · 5.15 KB
/
readstat_convert.c
File metadata and controls
143 lines (120 loc) · 5.15 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#include <errno.h>
#include <limits.h>
#include "readstat.h"
#include "readstat_iconv.h"
#include "readstat_convert.h"
readstat_error_t readstat_convert(char *dst, size_t dst_len, const char *src, size_t src_len, iconv_t converter) {
    /* Convert (or plainly copy) src into dst as a NUL-terminated string.
     *
     * dst/dst_len:  output buffer and its capacity, terminator included.
     * src/src_len:  input bytes; trailing ASCII spaces are ignored.
     * converter:    iconv descriptor to run the input through; when it is
     *               zero/NULL the bytes are copied verbatim.
     *
     * Returns READSTAT_OK on success, READSTAT_ERROR_CONVERT_LONG_STRING when
     * the result (plus terminator) does not fit in dst,
     * READSTAT_ERROR_CONVERT_BAD_STRING on an invalid multibyte sequence, and
     * READSTAT_ERROR_CONVERT on any other iconv failure except EINVAL. */

    /* The producing programs pad with ASCII spaces even when the payload is
     * in a non-ASCII encoding, so drop the padding before doing anything. */
    size_t in_left = src_len;
    while (in_left > 0 && src[in_left - 1] == ' ') {
        in_left -= 1;
    }

    /* Without at least one byte there is no room even for the terminator. */
    if (dst_len == 0) {
        return READSTAT_ERROR_CONVERT_LONG_STRING;
    }

    /* No converter: straight byte copy. */
    if (!converter) {
        if (in_left + 1 > dst_len) {
            return READSTAT_ERROR_CONVERT_LONG_STRING;
        }
        memcpy(dst, src, in_left);
        dst[in_left] = '\0';
        return READSTAT_OK;
    }

    /* Reserve the final byte of dst for the terminator. */
    size_t out_left = dst_len - 1;
    char *out = dst;
    size_t rc = iconv(converter, (readstat_iconv_inbuf_t)&src, &in_left, &out, &out_left);
    if (rc == (size_t)-1) {
        switch (errno) {
        case E2BIG:  /* output buffer is not large enough */
            return READSTAT_ERROR_CONVERT_LONG_STRING;
        case EILSEQ: /* invalid multibyte sequence in the input */
            return READSTAT_ERROR_CONVERT_BAD_STRING;
        case EINVAL: /* input ends mid-sequence; keep what was converted */
            break;
        default:
            return READSTAT_ERROR_CONVERT;
        }
    }

    /* Terminate right after the bytes iconv produced. */
    dst[dst_len - out_left - 1] = '\0';
    return READSTAT_OK;
}
int readstat_invalid_string_info(char *dst, size_t dst_len, const char *src, size_t src_len, int obs_index, readstat_variable_t *variable, void *ctx) {
    /* Invalid-string handler: report the offending value on stdout and ask
     * the caller to stop by returning READSTAT_HANDLER_ABORT.
     *
     * dst, dst_len and ctx are not used by this handler. */

    /* src is handed over together with an explicit length, so it must not be
     * assumed to be NUL-terminated: bound the print with a precision instead
     * of letting %s scan past the end of the buffer. The precision argument
     * is an int, hence the clamp. */
    int shown = src_len > (size_t)INT_MAX ? INT_MAX : (int)src_len;

    printf("Invalid string in variable %s, row %d: \"%.*s\"\n", variable->name, obs_index, shown, src);
    return READSTAT_HANDLER_ABORT;
}
int readstat_invalid_string_copy(char *dst, size_t dst_len, const char *src, size_t src_len, int obs_index, readstat_variable_t *variable, void *ctx) {
    /* Invalid-string handler: pass the raw bytes through untouched (minus
     * trailing padding) and let processing continue.
     *
     * Returns READSTAT_HANDLER_OK after copying, or READSTAT_HANDLER_ABORT
     * when dst cannot hold the bytes plus the terminating NUL. */

    /* Trailing ASCII spaces are padding added by the producing programs,
     * even for non-ASCII encodings; they are not part of the value. */
    size_t kept = src_len;
    while (kept > 0 && src[kept - 1] == ' ') {
        kept -= 1;
    }

    if (kept + 1 <= dst_len) {
        memcpy(dst, src, kept);
        dst[kept] = '\0';
        return READSTAT_HANDLER_OK;
    }

    return READSTAT_HANDLER_ABORT;
}
int readstat_invalid_string_skip(char *dst, size_t dst_len, const char *src, size_t src_len, int obs_index, readstat_variable_t *variable, void *ctx) {
    /* Invalid-string handler: discard the invalid value by storing an empty
     * string in dst and let processing continue.
     *
     * Returns READSTAT_HANDLER_OK once dst holds "", or READSTAT_HANDLER_ABORT
     * when dst has no room for the terminating NUL. The source bytes are
     * never read. */

    /* A zero-length destination cannot take even the terminator; writing
     * dst[0] would be out of bounds. */
    if (dst_len == 0) {
        return READSTAT_HANDLER_ABORT;
    }

    dst[0] = '\0';
    return READSTAT_HANDLER_OK;
}
int readstat_invalid_string_utf8(char *dst, size_t dst_len, const char *src, size_t src_len, int obs_index, readstat_variable_t *variable, void *ctx) {
    /* Invalid-string handler: treat src as UTF-8 and copy it to dst,
     * substituting the Unicode replacement character (U+FFFD, encoded as
     * EF BF BD) for every byte iconv rejects as an invalid sequence.
     *
     * Returns READSTAT_HANDLER_OK with dst NUL-terminated on success.
     * Returns READSTAT_HANDLER_ABORT when dst is too small, when the iconv
     * descriptor cannot be opened, or when iconv fails with an error other
     * than EILSEQ/EINVAL. The descriptor is closed on every path after a
     * successful open. */
    int retval = READSTAT_HANDLER_ABORT;

    /* strip off spaces from the input because the programs use ASCII space
     * padding even with non-ASCII encoding. */
    while (src_len && src[src_len-1] == ' ') {
        src_len--;
    }

    /* One byte is always reserved for the terminator; with dst_len == 0 the
     * subtraction below would wrap and iconv would be told the buffer is
     * practically unlimited. */
    if (dst_len == 0) {
        return READSTAT_HANDLER_ABORT;
    }

    iconv_t converter = iconv_open("UTF-8", "UTF-8");
    if (converter == (iconv_t)-1) {
        return READSTAT_HANDLER_ABORT;
    }

    size_t dst_left = dst_len - 1;
    char *dst_end = dst;
    size_t src_left = src_len;
    const char *src_end = src;

    while (src_left > 0) {
        size_t status = iconv(converter, (readstat_iconv_inbuf_t)&src_end, &src_left, &dst_end, &dst_left);
        if (status != (size_t)-1) {
            continue;
        }

        if (errno == EILSEQ) { /* EILSEQ indicates an invalid multibyte sequence */
            if (dst_left < 3) {
                goto cleanup;
            }
            /* Emit U+FFFD in place of the offending byte, then resume
             * conversion with the byte that follows it. */
            dst_end[0] = (char) 0xEF;
            dst_end[1] = (char) 0xBF;
            dst_end[2] = (char) 0xBD;
            dst_end += 3;
            src_end += 1;
            dst_left -= 3;
            src_left -= 1;
        } else if (errno == EINVAL) {
            /* EINVAL indicates improper truncation; accept what has been
             * converted so far and finish here. */
            break;
        } else {
            /* E2BIG (output buffer is not large enough) or an unexpected
             * failure. */
            goto cleanup;
        }
    }

    dst[dst_len - dst_left - 1] = '\0';
    retval = READSTAT_HANDLER_OK;

cleanup:
    iconv_close(converter);
    return retval;
}
int readstat_invalid_string_cp1252(char *dst, size_t dst_len, const char *src, size_t src_len, int obs_index, readstat_variable_t *variable, void *ctx) {
    /* Invalid-string handler: retry the conversion assuming the bytes are
     * really WINDOWS-1252 (a common mislabelling) and write the UTF-8 result
     * to dst.
     *
     * Returns READSTAT_HANDLER_OK with dst NUL-terminated on success, and
     * READSTAT_HANDLER_ABORT when dst has no room for a terminator, when the
     * iconv descriptor cannot be opened, or when the conversion fails for
     * any reason. The descriptor is closed on every path after a successful
     * open. */
    int retval = READSTAT_HANDLER_ABORT;

    /* Trailing ASCII spaces are padding, not part of the value. */
    while (src_len && src[src_len-1] == ' ') {
        src_len--;
    }

    /* One byte is always reserved for the terminator; with dst_len == 0 the
     * subtraction below would wrap around. */
    if (dst_len == 0) {
        return READSTAT_HANDLER_ABORT;
    }

    iconv_t converter = iconv_open("UTF-8", "WINDOWS-1252");
    if (converter == (iconv_t)-1) {
        return READSTAT_HANDLER_ABORT;
    }

    size_t dst_left = dst_len - 1;
    char *dst_end = dst;
    size_t status = iconv(converter, (readstat_iconv_inbuf_t)&src, &src_len, &dst_end, &dst_left);
    if (status != (size_t)-1) {
        dst[dst_len - dst_left - 1] = '\0';
        retval = READSTAT_HANDLER_OK;
    }

    iconv_close(converter);
    return retval;
}