diff -rwuN perl5.003_07.dist/MANIFEST perl5.003_07/MANIFEST --- perl5.003_07.dist/MANIFEST Thu Oct 10 20:59:23 1996 +++ perl5.003_07/MANIFEST Fri Nov 15 22:59:36 1996 @@ -302,6 +302,7 @@ lib/I18N/Collate.pm Routines to do strxfrm-based collation lib/IPC/Open2.pm Open a two-ended pipe lib/IPC/Open3.pm Open a three-ended pipe! +lib/Locale/Collate.pm Control locale collation lib/Math/BigFloat.pm An arbitrary precision floating-point arithmetic package lib/Math/BigInt.pm An arbitrary precision integer arithmetic package lib/Math/Complex.pm A Complex package diff -rwuN perl5.003_07.dist/ext/POSIX/POSIX.xs perl5.003_07/ext/POSIX/POSIX.xs --- perl5.003_07.dist/ext/POSIX/POSIX.xs Mon Oct 7 22:33:09 1996 +++ perl5.003_07/ext/POSIX/POSIX.xs Sat Nov 16 00:45:23 1996 @@ -2724,8 +2724,49 @@ char * locale CODE: RETVAL = setlocale(category, locale); - if (RETVAL) + if (RETVAL) { +#ifdef LC_CTYPE + if (category == LC_CTYPE || +#ifdef LC_ALL + category == LC_ALL || +#endif + 0) perl_init_fold(); +#endif /* LC_CTYPE */ +#ifdef LC_COLLATE + if (category == LC_COLLATE || +#ifdef LC_ALL + category == LC_ALL || +#endif + 0) { + char * collationpvnow = setlocale(LC_COLLATE, NULL); + if (collationpvnow) { + int neq = 0; + if (collationpv && (neq = !strEQ(collationpv, collationpvnow))) + Safefree(collationpv); + /* Bump the "collation generation index" forward only + * iff and whenn the collation locale changes. */ + if (neq && (collationpv = savepv(collationpvnow))) + collationix++; + } +#ifdef HAS_STRXFRM + /* 3: at most so many chars ('a', 'b', '\0' ). */ + /* 50: surely no system expands one char to more than that. */ +#define XFRMBUFSIZE (2 * 50) + { + char xbuf[XFRMBUFSIZE]; + Size_t fa = strxfrm(xbuf, "a", XFRMBUFSIZE); + Size_t fb = strxfrm(xbuf, "ab", XFRMBUFSIZE); + collxfrm_xf = fb - fa; + if (collxfrm_xf < 1) + croak("strxfrm() gets absurd"); + } +#else + /* collxfrm_xf makes no sense. */ +#endif /* HAS_STRXFRM */ +#endif /* LC_COLLATE */ + } + } OUTPUT: RETVAL diff -rwuN perl5.003_07.dist/global.sym perl5.003_07/global.sym --- perl5.003_07.dist/global.sym Mon Oct 7 22:33:09 1996 +++ perl5.003_07/global.sym Sat Nov 16 00:02:35 1996 @@ -23,6 +23,9 @@ bufptr bxor_amg check +collationix +collationpv +collxfrm_xf compiling compl_amg compcv @@ -77,7 +80,6 @@ last_lop last_lop_op last_uni -lc_collate_active le_amg lex_state lex_defer @@ -239,6 +241,7 @@ vtbl_amagicelem vtbl_arylen vtbl_bm +vtbl_collxfrm vtbl_dbline vtbl_env vtbl_envelem @@ -340,6 +343,7 @@ ck_subr ck_svconst ck_trunc +collating_by_locale convert cpytill croak @@ -484,6 +488,7 @@ magic_setamagic magic_setarylen magic_setbm +magic_setcollxfrm magic_setdbline magic_setenv magic_setglob @@ -1029,6 +1034,7 @@ sv_clean_objs sv_clear sv_cmp +sv_collxfrm sv_dec sv_dump sv_eq diff -rwuN perl5.003_07.dist/lib/I18N/Collate.pm perl5.003_07/lib/I18N/Collate.pm --- perl5.003_07.dist/lib/I18N/Collate.pm Mon Oct 7 22:33:10 1996 +++ perl5.003_07/lib/I18N/Collate.pm Fri Nov 15 22:07:26 1996 @@ -126,11 +126,8 @@ migrate the old applications away from it) because its functionality was integrated into the Perl core language in the release 5.003_06. - All scalar data is now collated according to the current locale setting. - Also, Perl does automatically the setlocale(LC_COLLATE, "") for you. - - To convert: forget I18N::Collate completely and use scalar data in - a completely normal way. + All scalar data can now be collated according to the current locale + setting. Please consult the perli18n documentation for the details. *** ___EOD___ diff -rwuN perl5.003_07.dist/lib/Locale/Collate.pm perl5.003_07/lib/Locale/Collate.pm --- perl5.003_07.dist/lib/Locale/Collate.pm Thu Jan 1 02:00:00 1970 +++ perl5.003_07/lib/Locale/Collate.pm Fri Nov 15 22:07:26 1996 @@ -0,0 +1,118 @@ +package Locale::Collate; + +=head1 NAME + +Locate::Collate - control the collation (sorting) style + +=head1 SYNOPIS + + use Locale::Collate; + + # return the current collation style + Locale::Collate->compare; + + # change the collation style to 'locale' + Locale::Collate->compare('locale'); + + # change the collation style to the default + Locale::Collate->compare('default'); + +=head1 DESCRIPTION + +Normally collating (sorting, comparing) data is done according +to the default locale, C<"C">. That is, according to the numerical +order of the ISO Latin 1 (eight bits) and ASCII (seven bits). + +With the C the sorting style can be changed to +B<'locale'>. With this sorting style collating data is done according +to the collation rules of the current locale. + +B: just setting the collation style to B<'locale'> will not +change anything unless also the sorting locale is changed using the +C) or the C calls. + + use POSIX qw(setlocale LC_COLLATE); + use Locale::Collate; + + setlocale(LC_COLLATE, ""); + Locale::Collate->compare("locale"); +=cut + +use vars qw($_compare $_compare_default); + +$_compare_default = 'default'; + +sub compare { + my ($class, $compare) = @_; + + if (defined $compare) { + $_compare = $compare; + } else { + $_compare = $_compare_default; + } + + $_compare = $_compare_default unless (defined $_compare); + + $_compare; +} + +1; +package Locale::Collate; + +=head1 NAME + +Locate::Collate - control the collation (sorting) style + +=head1 SYNOPIS + + use Locale::Collate; + + # return the current collation style + Locale::Collate->compare; + + # change the collation style to 'locale' + Locale::Collate->compare('locale'); + + # change the collation style to the default + Locale::Collate->compare('default'); + +=head1 DESCRIPTION + +Normally collating (sorting, comparing) data is done according +to the default locale, C<"C">. That is, according to the numerical +order of the ISO Latin 1 (eight bits) and ASCII (seven bits). + +With the C the sorting style can be changed to +B<'locale'>. With this sorting style collating data is done according +to the collation rules of the current locale. + +B: just setting the collation style to B<'locale'> will not +change anything unless also the sorting locale is changed using the +C) or the C calls. + + use POSIX qw(setlocale LC_COLLATE); + use Locale::Collate; + + setlocale(LC_COLLATE, ""); + Locale::Collate->compare("locale"); +=cut + +use vars qw($_compare $_compare_default); + +$_compare_default = 'default'; + +sub compare { + my ($class, $compare) = @_; + + if (defined $compare) { + $_compare = $compare; + } else { + $_compare = $_compare_default; + } + + $_compare = $_compare_default unless (defined $_compare); + + $_compare; +} + +1; diff -rwuN perl5.003_07.dist/mg.c perl5.003_07/mg.c --- perl5.003_07.dist/mg.c Thu Oct 10 21:38:03 1996 +++ perl5.003_07/mg.c Fri Nov 15 22:52:33 1996 @@ -1173,6 +1173,18 @@ } int +magic_setcollxfrm(sv,mg) +SV* sv; +MAGIC* mg; +{ + /* René Descartes said "I think not." + * and vanished with a faint plop. */ + sv_unmagic(sv, 'o'); + + return 0; +} + +int magic_set(sv,mg) SV* sv; MAGIC* mg; diff -rwuN perl5.003_07.dist/perl.c perl5.003_07/perl.c --- perl5.003_07.dist/perl.c Thu Oct 10 21:48:51 1996 +++ perl5.003_07/perl.c Fri Nov 15 22:06:33 1996 @@ -2121,7 +2121,6 @@ tainted = 0; if (tmpgv = gv_fetchpv("$",TRUE, SVt_PV)) sv_setiv(GvSV(tmpgv),(I32)getpid()); - } static void diff -rwuN perl5.003_07.dist/perl.h perl5.003_07/perl.h --- perl5.003_07.dist/perl.h Wed Oct 9 18:12:08 1996 +++ perl5.003_07/perl.h Sat Nov 16 00:01:39 1996 @@ -179,8 +179,6 @@ #include #endif -EXT int lc_collate_active; - #ifdef METHOD /* Defined by OSF/1 v3.0 by ctype.h */ #undef METHOD #endif @@ -1795,6 +1793,9 @@ EXT MGVTBL vtbl_uvar = {magic_getuvar, magic_setuvar, 0, 0, 0}; +EXT MGVTBL vtbl_collxfrm = {0, + magic_setcollxfrm, + 0, 0, 0}; #ifdef OVERLOAD EXT MGVTBL vtbl_amagic = {0, magic_setamagic, @@ -1824,6 +1825,7 @@ EXT MGVTBL vtbl_pos; EXT MGVTBL vtbl_bm; EXT MGVTBL vtbl_uvar; +EXT MGVTBL vtbl_collxfrm; #ifdef OVERLOAD EXT MGVTBL vtbl_amagic; @@ -1925,5 +1927,13 @@ #define printf PerlIO_stdoutf #endif -#endif /* Include guard */ +#ifdef LC_COLLATE +/* The "locale collation generation index". */ +EXT IV collationix; +/* The name of the current locale collation. */ +EXT char * collationpv; +/* The expansion factor in collation *xfrm() */ +EXT int collxfrm_xf; +#endif +#endif /* Include guard */ diff -rwuN perl5.003_07.dist/perl_exp.SH perl5.003_07/perl_exp.SH --- perl5.003_07.dist/perl_exp.SH Mon Oct 7 22:27:35 1996 +++ perl5.003_07/perl_exp.SH Fri Nov 15 22:07:41 1996 @@ -29,6 +29,7 @@ perl_init_ext perl_init_fold perl_init_i18nl14n +perl_init_i18nl10n perl_alloc perl_construct perl_destruct diff -rwuN perl5.003_07.dist/pod/perlguts.pod perl5.003_07/pod/perlguts.pod --- perl5.003_07.dist/pod/perlguts.pod Thu Oct 3 21:05:42 1996 +++ perl5.003_07/pod/perlguts.pod Fri Nov 15 22:07:41 1996 @@ -677,6 +677,7 @@ i vtbl_isaelem @ISA array element L 0 (but sets RMAGICAL) Perl Module/Debugger??? l vtbl_dbline Debugger? + o vtbl_collxfrm Locale Collation P vtbl_pack Tied Array or Hash p vtbl_packelem Tied Array or Hash element q vtbl_packelem Tied Scalar or Handle diff -rwuN perl5.003_07.dist/pod/perli18n.pod perl5.003_07/pod/perli18n.pod --- perl5.003_07.dist/pod/perli18n.pod Thu Oct 10 20:07:32 1996 +++ perl5.003_07/pod/perli18n.pod Sat Nov 16 18:57:52 1996 @@ -8,7 +8,7 @@ "is this a letter" and "which letter comes first". These are very important issues especially for languages other than English -- but also for English: it would be very -naïve indeed to think that C defines all the letters. +naïve indeed to think that C defines all the "letters". Perl understands the language-specific data via the standardized (ISO C, XPG4, POSIX 1.c) method called "the locale system". @@ -33,10 +33,10 @@ $old_locale = setlocale(LC_CTYPE); setlocale(LC_CTYPE, "fr_CA.ISO8859-1"); - # for LC_CTYPE now in locale "French, Canada, codeset ISO 8859-1" + # LC_CTYPE now in locale "French, Canada, codeset ISO 8859-1" setlocale(LC_CTYPE, ""); - # for LC_CTYPE now in locale what the LC_ALL / LC_CTYPE / LANG define. + # LC_CTYPE now in locale what the LC_ALL / LC_CTYPE / LANG define. # see below for documentation about the LC_ALL / LC_CTYPE / LANG. # restore the old locale @@ -78,7 +78,7 @@ Sadly enough even if the calling interface has been standardized the names of the locales are not. The naming usually is -language-country/territory-codeset but the latter parts may +language_country/territory.codeset but the latter parts may not be present. Two special locales are worth special mention: "C" @@ -111,17 +111,55 @@ =head2 Category LC_COLLATE: COLLATION -Starting from Perl version 5.003_06 perl has obeyed the B -environment variable which controls application's notions on the -collation (ordering) of the characters. C does in most Latin -alphabets follow the C but where do the C<Á> and C<Ä> belong? +Starting from Perl version 5.003_06 perl has been able to obey the +B environment variable which controls application's +notions on the collation (ordering) of the characters. C does in +most Latin alphabets follow the C but where do the C<Á> and C<Ä> +belong? + +B: comparing and sorting by locale is usually slower than the +default sorting, factors of 2 to 4 have been observed. It will also +consume more memory: while a Perl scalar variable is participating in +any string comparison or sorting operation and obeying the locale +collation rules it will take about 3-15 (the exact value depends on +the operating system) times more memory than normally. These downsides +are dictated more by the operating system implementation of the locale +system than by Perl. Here is a code snippet that will tell you what are the alphanumeric characters in the current locale, in the locale order: - perl -le 'print sort grep /\w/, map { chr() } 0..255' + use POSIX qw(setlocale LC_COLLATE); + use Locale::Collate; + + setlocale(LC_COLLATE, ""); + Locale::Collate->compare("locale"); + + print +(sort grep /\w/, map { chr() } 0..255), "\n"; As noted above, this will work only for Perl versions 5.003_06 and up. +The C<"locale"> argument for the above Locale::Collate->compare() +tells the comparison functions (lt, le, eq, cmp, ne, ge, gt, sort) to +obey the C. The default behavior can be restored with + + Locale::Collate->compare("default"); + +The default collation must be used for example for sorting raw binary +data whereas the locale collation is useful for natural text. + +B: in some locales some characters may have no collation value +at all -- this means for example if the C<'-'> is such a character the +C and C may sort to the same place. + +B: for certain environments the locale support by the operating +system is very simply broken and cannot be used or fixed by Perl. Such +deficiencies can and will result in mysterious hangs and/or Perl core +dumps. One such example is IRIX before the release 6.2, the C +support simply does not work. When confronted with such systems, +please report in excruciating detail to C, +complain to your vendor, maybe some bug fixes exist for your operating +system for these problems? Sometimes such bug fixes are called +an operating system upgrade. B: in the pre-5.003_06 Perl releases the per-locale collation was possible using the C library module. This is now @@ -172,7 +210,7 @@ C controls the collation of characters, see above. If this is unset and the C is set, the C is used as -the C. If both this and the C are unset but the +the C. If both this and the C are unset but the C is set, the C is used as the C. If none of these three is set, the default locale C<"C"> is used as the C. @@ -187,4 +225,6 @@ There are further locale-controlling environment variables (C) but Perl -B currently obey them. +B currently use them. + +=cut diff -rwuN perl5.003_07.dist/pp_ctl.c perl5.003_07/pp_ctl.c --- perl5.003_07.dist/pp_ctl.c Mon Oct 7 22:33:11 1996 +++ perl5.003_07/pp_ctl.c Sat Nov 16 18:55:26 1996 @@ -32,6 +32,7 @@ static I32 dopoptosub _((I32 startingblock)); static void save_lines _((AV *array, SV *sv)); static int sortcmp _((const void *, const void *)); +static int sortcmplocale _((const void *, const void *)); static int sortcv _((const void *, const void *)); static I32 sortcxix; @@ -649,6 +650,9 @@ else { if (max > 1) { MEXTEND(SP, 20); /* Can't afford stack realloc on signal. */ + if (collating_by_locale()) + qsort((char*)(ORIGMARK+1), max, sizeof(SV*), sortcmplocale); + else qsort((char*)(ORIGMARK+1), max, sizeof(SV*), sortcmp); } } @@ -1184,6 +1188,29 @@ return result; } +#define SORTCMP() \ + cur1 = SvCUR(str1); \ + cur2 = SvCUR(str2); \ + pvx1 = SvPVX(str1); \ + pvx2 = SvPVX(str2); \ + \ + if (cur1 < cur2) { \ + if ((retval = memcmp((void *)pvx1, (void *)pvx2, cur1))) \ + return retval; \ + else \ + return -1; \ + } \ + else if ((retval = memcmp((void *)pvx1, (void *)pvx2, cur2))) \ + return retval; \ + else if (cur1 == cur2) \ + return 0; \ + else \ + return 1; \ + +/* NOTE! This is the sister of sortcmplocale(). + * Remember to modify both if you modify either. */ +/* Also note that both the sortcmp() and sortcmplocale() + * bear a strong resemblance to sv_cmp(). */ static int sortcmp(a, b) const void *a; @@ -1191,61 +1218,71 @@ { register SV *str1 = *(SV **) a; register SV *str2 = *(SV **) b; + register STRLEN cur1; + register STRLEN cur2; + register char * pvx1; + register char * pvx2; I32 retval; - if (!SvPOKp(str1)) { + if (!SvPOKp(str1)) + return SvPOKp(str2) ? -1 : 0; if (!SvPOKp(str2)) - return 0; - else - return -1; + return 1; + + SORTCMP(); } + +/* NOTE! This is the sister of sortcmp(). + * Remember to modify both if you modify either. */ +/* Also note that both the sortcmp() and sortcmplocale() + * bear a strong resemblance to sv_cmp(). */ +static int +sortcmplocale(a, b) +const void *a; +const void *b; +{ + register SV *str1 = *(SV **) a; + register SV *str2 = *(SV **) b; + register char * pvx1, * pvx2; + STRLEN cur1, cur2; + I32 retval; + + if (!SvPOKp(str1)) + return SvPOKp(str2) ? -1 : 0; if (!SvPOKp(str2)) return 1; - if (lc_collate_active) { /* NOTE: this is the LC_COLLATE branch */ - register char * pv1, * pv2, * pvx; - STRLEN cur1, cur2, curx; - - pv1 = SvPV(str1, cur1); - pvx = mem_collxfrm(pv1, cur1, &curx); - pv1 = pvx; - cur1 = curx; - - pv2 = SvPV(str2, cur2); - pvx = mem_collxfrm(pv2, cur2, &curx); - pv2 = pvx; - cur2 = curx; +#ifdef LC_COLLATE - retval = memcmp((void *)pv1, (void *)pv2, cur1 < cur2 ? cur1 : cur2); + pvx1 = sv_collxfrm(str1, &cur1); + pvx2 = sv_collxfrm(str2, &cur2); - Safefree(pv1); - Safefree(pv2); + if (!pvx1 || !cur1) { + if (!pvx2 || !cur2) + goto do_memcmp; + return -1; + } + + if (!pvx2 || !cur2) + return 1; + + retval = memcmp((void *)pvx1, (void *)pvx2, cur1 < cur2 ? cur1 : cur2); if (retval) - return retval < 0 ? -1 : 1; + return retval; - if (cur1 == cur2) - return 0; - else - return cur1 < cur2 ? -1 : 1; - } +do_memcmp: - /* NOTE: this is the non-LC_COLLATE area */ + /* Collation returning equal is suspicious + * because non-collating characters (characters + * that do not have _any_ collation value) exist. + * Suppose '@' is such a character (this is not + * a far-fetched example). Now, sort these: + * qw(a@b ab a@@b @ab @a@b) */ - if (SvCUR(str1) < SvCUR(str2)) { - /*SUPPRESS 560*/ - if (retval = memcmp(SvPVX(str1), SvPVX(str2), SvCUR(str1))) - return retval; - else - return -1; - } - /*SUPPRESS 560*/ - else if (retval = memcmp(SvPVX(str1), SvPVX(str2), SvCUR(str2))) - return retval; - else if (SvCUR(str1) == SvCUR(str2)) - return 0; - else - return 1; +#endif /* LC_COLLATE */ + + SORTCMP(); } PP(pp_reset) diff -rwuN perl5.003_07.dist/proto.h perl5.003_07/proto.h --- perl5.003_07.dist/proto.h Mon Oct 7 22:33:11 1996 +++ perl5.003_07/proto.h Fri Nov 15 22:06:33 1996 @@ -41,6 +41,7 @@ #endif OP * ck_gvconst _((OP * o)); OP * ck_retarget _((OP *op)); +int collating_by_locale _(()); OP* convert _((I32 optype, I32 flags, OP* op)); char* cpytill _((char* to, char* from, char* fromend, int delim, I32* retlen)); void croak _((char* pat,...)) __attribute__((format(printf,1,2),noreturn)); @@ -194,6 +195,7 @@ int magic_setarylen _((SV* sv, MAGIC* mg)); int magic_setbm _((SV* sv, MAGIC* mg)); int magic_setdbline _((SV* sv, MAGIC* mg)); +int magic_setcollxfrm _((SV* sv, MAGIC* mg)); int magic_setenv _((SV* sv, MAGIC* mg)); int magic_setisa _((SV* sv, MAGIC* mg)); int magic_setglob _((SV* sv, MAGIC* mg)); @@ -329,8 +331,9 @@ AV* perl_get_av _((char* name, I32 create)); HV* perl_get_hv _((char* name, I32 create)); CV* perl_get_cv _((char* name, I32 create)); -int perl_init_fold _(()); +void perl_init_fold _(()); int perl_init_i18nl10n _((int printwarn)); +int perl_init_i18nl14n _((int printwarn)); int perl_parse _((PerlInterpreter* sv_interp, void(*xsinit)(void), int argc, char** argv, char** env)); void perl_require_pv _((char* pv)); #define perl_requirepv perl_require_pv @@ -437,6 +440,7 @@ void sv_clean_objs _((void)); void sv_clear _((SV* sv)); I32 sv_cmp _((SV* sv1, SV* sv2)); +char* sv_collxfrm _((SV* sv, STRLEN* nxp)); void sv_dec _((SV* sv)); void sv_dump _((SV* sv)); I32 sv_eq _((SV* sv1, SV* sv2)); diff -rwuN perl5.003_07.dist/sv.c perl5.003_07/sv.c --- perl5.003_07.dist/sv.c Wed Oct 9 18:31:53 1996 +++ perl5.003_07/sv.c Sat Nov 16 18:54:48 1996 @@ -2164,6 +2164,9 @@ case 'l': mg->mg_virtual = &vtbl_dbline; break; + case 'o': + mg->mg_virtual = &vtbl_collxfrm; + break; case 'P': mg->mg_virtual = &vtbl_pack; break; @@ -2603,84 +2606,60 @@ return !memcmp(pv1, pv2, cur1); } +/* Note that the sv_cmp() bears a strong resemblance + * to both of the sortcmp() and sortcmplocale(). */ I32 sv_cmp(str1,str2) register SV *str1; register SV *str2; { - I32 retval; - char *pv1; + register I32 retval; + register char *pv1; STRLEN cur1; - char *pv2; + register char *pv2; STRLEN cur2; - if (lc_collate_active) { /* NOTE: this is the LC_COLLATE branch */ - - if (!str1) { - pv1 = ""; - cur1 = 0; - } else { - pv1 = SvPV(str1, cur1); + if (!str1) + return str2 ? -1 : 0; + if (!str2) + return 1; - { - STRLEN cur1x; - char * pv1x = mem_collxfrm(pv1, cur1, &cur1x); +#ifdef LC_COLLATE - pv1 = pv1x; - cur1 = cur1x; - } - } + if (collating_by_locale()) { - if (!str2) { - pv2 = ""; - cur2 = 0; - } else { - pv2 = SvPV(str2, cur2); + pv1 = sv_collxfrm(str1, &cur1); + pv2 = sv_collxfrm(str2, &cur2); - { - STRLEN cur2x; - char * pv2x = mem_collxfrm(pv2, cur2, &cur2x); - - pv2 = pv2x; - cur2 = cur2x; - } - } - - if (!cur1) { - Safefree(pv2); - return cur2 ? -1 : 0; + if (!pv1 || !cur1) { + if (!pv2 || !cur2) + goto do_memcmp; + return -1; } - if (!cur2) { - Safefree(pv1); + if (!pv2 || !cur2) return 1; - } retval = memcmp((void*)pv1, (void*)pv2, cur1 < cur2 ? cur1 : cur2); - Safefree(pv1); - Safefree(pv2); - if (retval) return retval < 0 ? -1 : 1; - if (cur1 == cur2) - return 0; - else - return cur1 < cur2 ? -1 : 1; + /* Yes, this fall-through is intentional. + * In the case collation xfrmed strings memcmp + * equal they might contain non-collating + * characters like '-' which do not have _any_ + * collation value. In this case we do + * a normal memcmp. */ + } - } else { /* NOTE: this is the non-LC_COLLATE branch */ +do_memcmp: - if (!str1) { - pv1 = ""; - cur1 = 0; - } else - pv1 = SvPV(str1, cur1); +#endif /* LC_COLLATE */ - if (!str2) { - pv2 = ""; - cur2 = 0; - } else + /* This the non-collating compare. */ + + pv1 = SvPV(str1, cur1); pv2 = SvPV(str2, cur2); if (!cur1) @@ -2699,6 +2678,64 @@ else return cur1 < cur2 ? -1 : 1; } + +char * +sv_collxfrm(sv, nxp) + SV * sv; + STRLEN * nxp; +{ + char * mx; + +#ifdef LC_COLLATE + + STRLEN nx; + MAGIC * mg = 0; + + if (!SvMAGICAL(sv)) /* If no magic go and do some. */ + goto xfrm; /* non-structured but fast. */ + + if ((mg = mg_find(sv, 'o'))) { + /* If the "collation generation" is obsolete, re-xfrm. + * POSIX::setlocale() bumps the generation index forward. */ + if (*(IV *)(mg->mg_ptr) < collationix) + goto xfrm; /* non-structured but fast. */ + + mx = mg->mg_ptr; + nx = mg->mg_len; + + } else { + + xfrm: + + /* Any scalar variable may carry an 'o' magic that contains the + * scalar data of the variable transformed to such a format that + * a normal memcmp() can be used to compare the data according + * to the locale settings. */ + + { + char * pv; + STRLEN len; + + pv = SvPV(sv, len); + if ((mx = mem_collxfrm(pv, len, &nx))) { + sv_magic(sv, 0, 'o', 0, 0); + if ((mg = mg_find(sv, 'o'))) { + /* The current "collation generation index". */ + *(IV *)(mx) = collationix; + mg->mg_ptr = mx; + mg->mg_len = nx; + } else + return 0; + } else + return 0; + } + } + + * nxp = nx; + +#endif /* LC_COLLATE */ + + return mx + sizeof(collationix); } char * diff -rwuN perl5.003_07.dist/util.c perl5.003_07/util.c --- perl5.003_07.dist/util.c Thu Oct 10 21:49:01 1996 +++ perl5.003_07/util.c Sat Nov 16 18:56:10 1996 @@ -404,16 +404,19 @@ return Nullch; } -/* Initialize the fold[] array. */ -int +/* Initialize the character classes and the fold[] array. */ +void perl_init_fold() { int i; for (i = 0; i < 256; i++) { - if (isUPPER(i)) fold[i] = toLOWER(i); - else if (isLOWER(i)) fold[i] = toUPPER(i); - else fold[i] = i; + if (isUPPER(i)) + fold[i] = toLOWER(i); + else if (isLOWER(i)) + fold[i] = toUPPER(i); + else + fold[i] = i; } } @@ -431,25 +434,17 @@ #if defined(HAS_SETLOCALE) char * lc_all = getenv("LC_ALL"); char * lc_ctype = getenv("LC_CTYPE"); - char * lc_collate = getenv("LC_COLLATE"); char * lang = getenv("LANG"); int setlocale_failure = 0; +#ifdef LC_CTYPE + #define SETLOCALE_LC_CTYPE 0x01 -#define SETLOCALE_LC_COLLATE 0x02 -#ifdef LC_CTYPE if (setlocale(LC_CTYPE, "") == 0) setlocale_failure |= SETLOCALE_LC_CTYPE; #endif -#ifdef LC_COLLATE - if (setlocale(LC_COLLATE, "") == 0) - setlocale_failure |= SETLOCALE_LC_COLLATE; - else - lc_collate_active = 1; -#endif - if (setlocale_failure && (lc_all || lang)) { char *perl_badlang; @@ -464,11 +459,6 @@ PerlIO_printf(PerlIO_stderr(), "LC_CTYPE "); #endif -#ifdef LC_COLLATE - if (setlocale_failure & SETLOCALE_LC_COLLATE) - PerlIO_printf(PerlIO_stderr(), - "LC_COLLATE "); -#endif PerlIO_printf(PerlIO_stderr(), "\n"); @@ -490,15 +480,6 @@ lc_ctype ? '"' : ')' ); #endif -#ifdef LC_COLLATE - if (setlocale_failure & SETLOCALE_LC_COLLATE) - PerlIO_printf(PerlIO_stderr(), - "\tLC_COLLATE = %c%s%c,\n", - lc_collate ? '"' : '(', - lc_collate ? lc_collate : "unset", - lc_collate ? '"' : ')' - ); -#endif PerlIO_printf(PerlIO_stderr(), "\tLANG = %c%s%c\n", lang ? '"' : ')', @@ -528,35 +509,76 @@ #endif } - if (setlocale_failure & SETLOCALE_LC_CTYPE == 0) +#ifdef SETLOCALE_LC_CTYPE + if (!(setlocale_failure & SETLOCALE_LC_CTYPE)) perl_init_fold(); +#endif + +#ifdef LC_COLLATE + collationpv = 0; + collationix = 0; + collxfrm_xf = 2; /* a reasonable guess. */ +#endif #endif /* #if defined(HAS_SETLOCALE) */ return ok; } +/* Backwards compatibility. */ +int +perl_init_i18nl14n(printwarn) + int printwarn; +{ + perl_init_i18nl10n(printwarn); +} + +int +collating_by_locale +() +{ + int localecollate = 0; + SV * lccsv; + + if (lccsv = perl_get_sv("Locale::Collate::_compare", FALSE)) { + char * pv; + STRLEN len; + +#if defined(HAS_SETLOCALE) && defined(LC_COLLATE) + /* catch 'fake locale collation' */ + char * lc_collate = setlocale(LC_COLLATE, NULL); + + if (!lc_collate || (lc_collate[0] == 'C' && lc_collate[1] == 0)) + return 0; +#endif + pv = SvPV(lccsv, len); + localecollate = strnEQ(pv, "locale", 6); + } + + return localecollate; +} + +/* mem_collxfrm() is a bit like strxfrm() but with two important + * differences. First, it handles embedded NULs. Second, it allocates + * a bit more memory than needed for the transformed data itself. + * The real transformed data begins at offset sizeof(collationix). + * Please see sv_collxfrm() to see how this is used. */ char * -mem_collxfrm(m, n, nx) /* mem_collxfrm() does strxfrm() for (data,size) */ - const char *m; /* "strings", that is, transforms normal eight-bit */ - const Size_t n; /* data into a format that can be memcmp()ed to get */ - Size_t * nx; /* 'the right' result for each locale. */ -{ /* Uses strxfrm() but handles embedded NULs. */ +mem_collxfrm(m, n, nx) + const char *m; + const Size_t n; + Size_t * nx; +{ char * mx = 0; #ifdef HAS_STRXFRM Size_t ma; + Size_t ox; - /* the expansion factor of 16 has been seen with strxfrm() */ - ma = (lc_collate_active ? 16 : 1) * n + 1; + /* the first sizeof(collationix) bytes are used by sv_collxfrm(). */ + /* the +1 is for the terminating NUL. */ -#define RENEW_mx() \ - do { \ - ma = 2 * ma + 1; \ - Renew(mx, ma, char); \ - if (mx == 0) \ - goto out; \ - } while (0) + ma = sizeof(collationix) + (collating_by_locale() ? collxfrm_xf : 1) * n + 1; New(171, mx, ma, char); @@ -564,29 +586,38 @@ Size_t xc, dx; int xok; - for (*nx = 0, xc = 0; xc < n; ) { - if (m[xc] == 0) - do { - if (*nx == ma) - RENEW_mx(); - mx[*nx++] = m[xc++]; - } while (xc < n && m[xc] == 0); - else { + for (ox = sizeof(collationix), xc = 0; xc < n; ) { do { - dx = strxfrm(mx + *nx, m + xc, ma - *nx); - if (dx + *nx > ma) { - RENEW_mx(); + dx = strxfrm(mx + ox, m + xc, ma - ox); + if (dx == (Size_t) -1) + goto bad; + if (dx + ox > ma) { + ma = 2 * ma + 1; + Renew(mx, ma, char); + if (mx == 0) + goto bad; xok = 0; } else xok = 1; } while (!xok); - xc += strlen(mx + *nx); - *nx += dx; - } + xc += strlen(m + xc) + 1; + ox += dx; + /* Embedded NULs are understood but silently skipped + * because they make no sense in locale collation. */ } + mx[ox] = 0; + *nx = ox - sizeof(collationix); + goto good; } -out: +bad: + + if (mx) + Safefree(mx); + mx = 0; + *nx = 0; + +good: #endif /* HAS_STRXFRM */