Revision 09f68782
Von Moritz Bunkus vor mehr als 5 Jahren hinzugefügt
modules/fallback/Sort/Naturally.pm | ||
---|---|---|
1 |
|
|
2 |
require 5; |
|
3 |
package Sort::Naturally; # Time-stamp: "2004-12-29 18:30:03 AST" |
|
4 |
$VERSION = '1.02'; |
|
5 |
@EXPORT = ('nsort', 'ncmp'); |
|
6 |
require Exporter; |
|
7 |
@ISA = ('Exporter'); |
|
8 |
|
|
9 |
use strict; |
|
10 |
use locale; |
|
11 |
use integer; |
|
12 |
|
|
13 |
#----------------------------------------------------------------------------- |
|
14 |
# constants: |
|
15 |
BEGIN { *DEBUG = sub () {0} unless defined &DEBUG } |
|
16 |
|
|
17 |
use Config (); |
|
18 |
BEGIN { |
|
19 |
# Make a constant such that if a whole-number string is that long |
|
20 |
# or shorter, we KNOW it's treatable as an integer |
|
21 |
no integer; |
|
22 |
my $x = length(256 ** $Config::Config{'intsize'} / 2) - 1; |
|
23 |
die "Crazy intsize: <$Config::Config{'intsize'}>" if $x < 4; |
|
24 |
eval 'sub MAX_INT_SIZE () {' . $x . '}'; |
|
25 |
die $@ if $@; |
|
26 |
print "intsize $Config::Config{'intsize'} => MAX_INT_SIZE $x\n" if DEBUG; |
|
27 |
} |
|
28 |
|
|
29 |
sub X_FIRST () {-1} |
|
30 |
sub Y_FIRST () { 1} |
|
31 |
|
|
32 |
my @ORD = ('same', 'swap', 'asis'); |
|
33 |
|
|
34 |
#----------------------------------------------------------------------------- |
|
35 |
# For lack of a preprocessor: |
|
36 |
|
|
37 |
my($code, $guts); |
|
38 |
$guts = <<'EOGUTS'; # This is the guts of both ncmp and nsort: |
|
39 |
|
|
40 |
if($x eq $y) { |
|
41 |
# trap this expensive case first, and then fall thru to tiebreaker |
|
42 |
$rv = 0; |
|
43 |
|
|
44 |
# Convoluted hack to get numerics to sort first, at string start: |
|
45 |
} elsif($x =~ m/^\d/s) { |
|
46 |
if($y =~ m/^\d/s) { |
|
47 |
$rv = 0; # fall thru to normal comparison for the two numbers |
|
48 |
} else { |
|
49 |
$rv = X_FIRST; |
|
50 |
DEBUG > 1 and print "Numeric-initial $x trumps letter-initial $y\n"; |
|
51 |
} |
|
52 |
} elsif($y =~ m/^\d/s) { |
|
53 |
$rv = Y_FIRST; |
|
54 |
DEBUG > 1 and print "Numeric-initial $y trumps letter-initial $x\n"; |
|
55 |
} else { |
|
56 |
$rv = 0; |
|
57 |
} |
|
58 |
|
|
59 |
unless($rv) { |
|
60 |
# Normal case: |
|
61 |
$rv = 0; |
|
62 |
DEBUG and print "<$x> and <$y> compared...\n"; |
|
63 |
|
|
64 |
Consideration: |
|
65 |
while(length $x and length $y) { |
|
66 |
|
|
67 |
DEBUG > 2 and print " <$x> and <$y>...\n"; |
|
68 |
|
|
69 |
# First, non-numeric comparison: |
|
70 |
$x2 = ($x =~ m/^(\D+)/s) ? length($1) : 0; |
|
71 |
$y2 = ($y =~ m/^(\D+)/s) ? length($1) : 0; |
|
72 |
# Now make x2 the min length of the two: |
|
73 |
$x2 = $y2 if $x2 > $y2; |
|
74 |
if($x2) { |
|
75 |
DEBUG > 1 and printf " <%s> and <%s> lexically for length $x2...\n", |
|
76 |
substr($x,0,$x2), substr($y,0,$x2); |
|
77 |
do { |
|
78 |
my $i = substr($x,0,$x2); |
|
79 |
my $j = substr($y,0,$x2); |
|
80 |
my $sv = $i cmp $j; |
|
81 |
print "SCREAM! on <$i><$j> -- $sv != $rv \n" unless $rv == $sv; |
|
82 |
last; |
|
83 |
} |
|
84 |
|
|
85 |
|
|
86 |
if $rv = |
|
87 |
# The ''. things here force a copy that seems to work around a |
|
88 |
# mysterious intermittent bug that 'use locale' provokes in |
|
89 |
# many versions of Perl. |
|
90 |
$cmp |
|
91 |
? $cmp->(substr($x,0,$x2) . '', |
|
92 |
substr($y,0,$x2) . '', |
|
93 |
) |
|
94 |
: |
|
95 |
scalar(( substr($x,0,$x2) . '' ) cmp |
|
96 |
( substr($y,0,$x2) . '' ) |
|
97 |
) |
|
98 |
; |
|
99 |
# otherwise trim and keep going: |
|
100 |
substr($x,0,$x2) = ''; |
|
101 |
substr($y,0,$x2) = ''; |
|
102 |
} |
|
103 |
|
|
104 |
# Now numeric: |
|
105 |
# (actually just using $x2 and $y2 as scratch) |
|
106 |
|
|
107 |
if( $x =~ s/^(\d+)//s ) { |
|
108 |
$x2 = $1; |
|
109 |
if( $y =~ s/^(\d+)//s ) { |
|
110 |
# We have two numbers here. |
|
111 |
DEBUG > 1 and print " <$x2> and <$1> numerically\n"; |
|
112 |
if(length($x2) < MAX_INT_SIZE and length($1) < MAX_INT_SIZE) { |
|
113 |
# small numbers: we can compare happily |
|
114 |
last if $rv = $x2 <=> $1; |
|
115 |
} else { |
|
116 |
# ARBITRARILY large integers! |
|
117 |
|
|
118 |
# This saves on loss of precision that could happen |
|
119 |
# with actual stringification. |
|
120 |
# Also, I sense that very large numbers aren't too |
|
121 |
# terribly common in sort data. |
|
122 |
|
|
123 |
# trim leading 0's: |
|
124 |
($y2 = $1) =~ s/^0+//s; |
|
125 |
$x2 =~ s/^0+//s; |
|
126 |
print " Treating $x2 and $y2 as bigint\n" if DEBUG; |
|
127 |
|
|
128 |
no locale; # we want the dumb cmp back. |
|
129 |
last if $rv = ( |
|
130 |
# works only for non-negative whole numbers: |
|
131 |
length($x2) <=> length($y2) |
|
132 |
# the longer the numeral, the larger the value |
|
133 |
or $x2 cmp $y2 |
|
134 |
# between equals, compare lexically!! amazing but true. |
|
135 |
); |
|
136 |
} |
|
137 |
} else { |
|
138 |
# X is numeric but Y isn't |
|
139 |
$rv = Y_FIRST; |
|
140 |
last; |
|
141 |
} |
|
142 |
} elsif( $y =~ s/^\d+//s ) { # we don't need to capture the substring |
|
143 |
$rv = X_FIRST; |
|
144 |
last; |
|
145 |
} |
|
146 |
# else one of them is 0-length. |
|
147 |
|
|
148 |
# end-while |
|
149 |
} |
|
150 |
} |
|
151 |
EOGUTS |
|
152 |
|
|
153 |
sub maker { |
|
154 |
my $code = $_[0]; |
|
155 |
$code =~ s/~COMPARATOR~/$guts/g || die "Can't find ~COMPARATOR~"; |
|
156 |
eval $code; |
|
157 |
die $@ if $@; |
|
158 |
} |
|
159 |
|
|
160 |
############################################################################## |
|
161 |
|
|
162 |
maker(<<'EONSORT'); |
|
163 |
sub nsort { |
|
164 |
# get options: |
|
165 |
my($cmp, $lc); |
|
166 |
($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY'; |
|
167 |
|
|
168 |
return @_ unless @_ > 1 or wantarray; # be clever |
|
169 |
|
|
170 |
my($x, $x2, $y, $y2, $rv); # scratch vars |
|
171 |
|
|
172 |
# We use a Schwartzian xform to memoize the lc'ing and \W-removal |
|
173 |
|
|
174 |
map $_->[0], |
|
175 |
sort { |
|
176 |
if($a->[0] eq $b->[0]) { 0 } # trap this expensive case |
|
177 |
else { |
|
178 |
|
|
179 |
$x = $a->[1]; |
|
180 |
$y = $b->[1]; |
|
181 |
|
|
182 |
~COMPARATOR~ |
|
183 |
|
|
184 |
# Tiebreakers... |
|
185 |
DEBUG > 1 and print " -<${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n"; |
|
186 |
$rv ||= (length($x) <=> length($y)) # shorter is always first |
|
187 |
|| ($cmp and $cmp->($x,$y) || $cmp->($a->[0], $b->[0])) |
|
188 |
|| ($x cmp $y ) |
|
189 |
|| ($a->[0] cmp $b->[0]) |
|
190 |
; |
|
191 |
|
|
192 |
DEBUG > 1 and print " <${$a}[0]> cmp <${$b}[0]> is $rv ($ORD[$rv])\n"; |
|
193 |
$rv; |
|
194 |
}} |
|
195 |
|
|
196 |
map {; |
|
197 |
$x = $lc ? $lc->($_) : lc($_); # x as scratch |
|
198 |
$x =~ s/\W+//s; |
|
199 |
[$_, $x]; |
|
200 |
} |
|
201 |
@_ |
|
202 |
} |
|
203 |
EONSORT |
|
204 |
|
|
205 |
#----------------------------------------------------------------------------- |
|
206 |
maker(<<'EONCMP'); |
|
207 |
sub ncmp { |
|
208 |
# The guts are basically the same as above... |
|
209 |
|
|
210 |
# get options: |
|
211 |
my($cmp, $lc); |
|
212 |
($cmp,$lc) = @{shift @_} if @_ and ref($_[0]) eq 'ARRAY'; |
|
213 |
|
|
214 |
if(@_ == 0) { |
|
215 |
@_ = ($a, $b); # bit of a hack! |
|
216 |
DEBUG > 1 and print "Hacking in <$a><$b>\n"; |
|
217 |
} elsif(@_ != 2) { |
|
218 |
require Carp; |
|
219 |
Carp::croak("Not enough options to ncmp!"); |
|
220 |
} |
|
221 |
my($a,$b) = @_; |
|
222 |
my($x, $x2, $y, $y2, $rv); # scratch vars |
|
223 |
|
|
224 |
DEBUG > 1 and print "ncmp args <$a><$b>\n"; |
|
225 |
if($a eq $b) { # trap this expensive case |
|
226 |
0; |
|
227 |
} else { |
|
228 |
$x = ($lc ? $lc->($a) : lc($a)); |
|
229 |
$x =~ s/\W+//s; |
|
230 |
$y = ($lc ? $lc->($b) : lc($b)); |
|
231 |
$y =~ s/\W+//s; |
|
232 |
|
|
233 |
~COMPARATOR~ |
|
234 |
|
|
235 |
|
|
236 |
# Tiebreakers... |
|
237 |
DEBUG > 1 and print " -<$a> cmp <$b> is $rv ($ORD[$rv])\n"; |
|
238 |
$rv ||= (length($x) <=> length($y)) # shorter is always first |
|
239 |
|| ($cmp and $cmp->($x,$y) || $cmp->($a,$b)) |
|
240 |
|| ($x cmp $y) |
|
241 |
|| ($a cmp $b) |
|
242 |
; |
|
243 |
|
|
244 |
DEBUG > 1 and print " <$a> cmp <$b> is $rv\n"; |
|
245 |
$rv; |
|
246 |
} |
|
247 |
} |
|
248 |
EONCMP |
|
249 |
|
|
250 |
# clean up: |
|
251 |
undef $guts; |
|
252 |
undef &maker; |
|
253 |
|
|
254 |
#----------------------------------------------------------------------------- |
|
255 |
1; |
|
256 |
|
|
257 |
############### END OF MAIN SOURCE ########################################### |
|
258 |
__END__ |
|
259 |
|
|
260 |
=head1 NAME |
|
261 |
|
|
262 |
Sort::Naturally -- sort lexically, but sort numeral parts numerically |
|
263 |
|
|
264 |
=head1 SYNOPSIS |
|
265 |
|
|
266 |
@them = nsort(qw( |
|
267 |
foo12a foo12z foo13a foo 14 9x foo12 fooa foolio Foolio Foo12a |
|
268 |
)); |
|
269 |
print join(' ', @them), "\n"; |
|
270 |
|
|
271 |
Prints: |
|
272 |
|
|
273 |
9x 14 foo fooa foolio Foolio foo12 foo12a Foo12a foo12z foo13a |
|
274 |
|
|
275 |
(Or "foo12a" + "Foo12a" and "foolio" + "Foolio" and might be |
|
276 |
switched, depending on your locale.) |
|
277 |
|
|
278 |
=head1 DESCRIPTION |
|
279 |
|
|
280 |
This module exports two functions, C<nsort> and C<ncmp>; they are used |
|
281 |
in implementing my idea of a "natural sorting" algorithm. Under natural |
|
282 |
sorting, numeric substrings are compared numerically, and other |
|
283 |
word-characters are compared lexically. |
|
284 |
|
|
285 |
This is the way I define natural sorting: |
|
286 |
|
|
287 |
=over |
|
288 |
|
|
289 |
=item * |
|
290 |
|
|
291 |
Non-numeric word-character substrings are sorted lexically, |
|
292 |
case-insensitively: "Foo" comes between "fish" and "fowl". |
|
293 |
|
|
294 |
=item * |
|
295 |
|
|
296 |
Numeric substrings are sorted numerically: |
|
297 |
"100" comes after "20", not before. |
|
298 |
|
|
299 |
=item * |
|
300 |
|
|
301 |
\W substrings (neither words-characters nor digits) are I<ignored>. |
|
302 |
|
|
303 |
=item * |
|
304 |
|
|
305 |
Our use of \w, \d, \D, and \W is locale-sensitive: Sort::Naturally |
|
306 |
uses a C<use locale> statement. |
|
307 |
|
|
308 |
=item * |
|
309 |
|
|
310 |
When comparing two strings, where a numeric substring in one |
|
311 |
place is I<not> up against a numeric substring in another, |
|
312 |
the non-numeric always comes first. This is fudged by |
|
313 |
reading pretending that the lack of a number substring has |
|
314 |
the value -1, like so: |
|
315 |
|
|
316 |
foo => "foo", -1 |
|
317 |
foobar => "foo", -1, "bar" |
|
318 |
foo13 => "foo", 13, |
|
319 |
foo13xyz => "foo", 13, "xyz" |
|
320 |
|
|
321 |
That's so that "foo" will come before "foo13", which will come |
|
322 |
before "foobar". |
|
323 |
|
|
324 |
=item * |
|
325 |
|
|
326 |
The start of a string is exceptional: leading non-\W (non-word, |
|
327 |
non-digit) |
|
328 |
components are are ignored, and numbers come I<before> letters. |
|
329 |
|
|
330 |
=item * |
|
331 |
|
|
332 |
I define "numeric substring" just as sequences matching m/\d+/ -- |
|
333 |
scientific notation, commas, decimals, etc., are not seen. If |
|
334 |
your data has thousands separators in numbers |
|
335 |
("20,000 Leagues Under The Sea" or "20.000 lieues sous les mers"), |
|
336 |
consider stripping them before feeding them to C<nsort> or |
|
337 |
C<ncmp>. |
|
338 |
|
|
339 |
=back |
|
340 |
|
|
341 |
=head2 The nsort function |
|
342 |
|
|
343 |
This function takes a list of strings, and returns a copy of the list, |
|
344 |
sorted. |
|
345 |
|
|
346 |
This is what most people will want to use: |
|
347 |
|
|
348 |
@stuff = nsort(...list...); |
|
349 |
|
|
350 |
When nsort needs to compare non-numeric substrings, it |
|
351 |
uses Perl's C<lc> function in scope of a <use locale>. |
|
352 |
And when nsort needs to lowercase things, it uses Perl's |
|
353 |
C<lc> function in scope of a <use locale>. If you want nsort |
|
354 |
to use other functions instead, you can specify them in |
|
355 |
an arrayref as the first argument to nsort: |
|
356 |
|
|
357 |
@stuff = nsort( [ |
|
358 |
\&string_comparator, # optional |
|
359 |
\&lowercaser_function # optional |
|
360 |
], |
|
361 |
...list... |
|
362 |
); |
|
363 |
|
|
364 |
If you want to specify a string comparator but no lowercaser, |
|
365 |
then the options list is C<[\&comparator, '']> or |
|
366 |
C<[\&comparator]>. If you want to specify no string comparator |
|
367 |
but a lowercaser, then the options list is |
|
368 |
C<['', \&lowercaser]>. |
|
369 |
|
|
370 |
Any comparator you specify is called as |
|
371 |
C<$comparator-E<gt>($left, $right)>, |
|
372 |
and, like a normal Perl C<cmp> replacement, must return |
|
373 |
-1, 0, or 1 depending on whether the left argument is stringwise |
|
374 |
less than, equal to, or greater than the right argument. |
|
375 |
|
|
376 |
Any lowercaser function you specify is called as |
|
377 |
C<$lowercased = $lowercaser-E<gt>($original)>. The routine |
|
378 |
must not modify its C<$_[0]>. |
|
379 |
|
|
380 |
=head2 The ncmp function |
|
381 |
|
|
382 |
Often, when sorting non-string values like this: |
|
383 |
|
|
384 |
@objects_sorted = sort { $a->tag cmp $b->tag } @objects; |
|
385 |
|
|
386 |
...or even in a Schwartzian transform, like this: |
|
387 |
|
|
388 |
@strings = |
|
389 |
map $_->[0] |
|
390 |
sort { $a->[1] cmp $b->[1] } |
|
391 |
map { [$_, make_a_sort_key_from($_) ] |
|
392 |
@_ |
|
393 |
; |
|
394 |
|
|
395 |
...you wight want something that replaces not C<sort>, but C<cmp>. |
|
396 |
That's what Sort::Naturally's C<ncmp> function is for. Call it with |
|
397 |
the syntax C<ncmp($left,$right)> instead of C<$left cmp $right>, |
|
398 |
but otherwise it's a fine replacement: |
|
399 |
|
|
400 |
@objects_sorted = sort { ncmp($a->tag,$b->tag) } @objects; |
|
401 |
|
|
402 |
@strings = |
|
403 |
map $_->[0] |
|
404 |
sort { ncmp($a->[1], $b->[1]) } |
|
405 |
map { [$_, make_a_sort_key_from($_) ] |
|
406 |
@_ |
|
407 |
; |
|
408 |
|
|
409 |
Just as with C<nsort> can take different a string-comparator |
|
410 |
and/or lowercaser, you can do the same with C<ncmp>, by passing |
|
411 |
an arrayref as the first argument: |
|
412 |
|
|
413 |
ncmp( [ |
|
414 |
\&string_comparator, # optional |
|
415 |
\&lowercaser_function # optional |
|
416 |
], |
|
417 |
$left, $right |
|
418 |
) |
|
419 |
|
|
420 |
You might get string comparators from L<Sort::ArbBiLex|Sort::ArbBiLex>. |
|
421 |
|
|
422 |
=head1 NOTES |
|
423 |
|
|
424 |
=over |
|
425 |
|
|
426 |
=item * |
|
427 |
|
|
428 |
This module is not a substitute for |
|
429 |
L<Sort::Versions|Sort::Versions>! If |
|
430 |
you just need proper version sorting, use I<that!> |
|
431 |
|
|
432 |
=item * |
|
433 |
|
|
434 |
If you need something that works I<sort of> like this module's |
|
435 |
functions, but not quite the same, consider scouting thru this |
|
436 |
module's source code, and adapting what you see. Besides |
|
437 |
the functions that actually compile in this module, after the POD, |
|
438 |
there's several alternate attempts of mine at natural sorting |
|
439 |
routines, which are not compiled as part of the module, but which you |
|
440 |
might find useful. They should all be I<working> implementations of |
|
441 |
slightly different algorithms |
|
442 |
(all of them based on Martin Pool's C<nsort>) which I eventually |
|
443 |
discarded in favor of my algorithm. If you are having to |
|
444 |
naturally-sort I<very large> data sets, and sorting is getting |
|
445 |
ridiculously slow, you might consider trying one of those |
|
446 |
discarded functions -- I have a feeling they might be faster on |
|
447 |
large data sets. Benchmark them on your data and see. (Unless |
|
448 |
you I<need> the speed, don't bother. Hint: substitute C<sort> |
|
449 |
for C<nsort> in your code, and unless your program speeds up |
|
450 |
drastically, it's not the sorting that's slowing things down. |
|
451 |
But if it I<is> C<nsort> that's slowing things down, consider |
|
452 |
just: |
|
453 |
|
|
454 |
if(@set >= SOME_VERY_BIG_NUMBER) { |
|
455 |
no locale; # vroom vroom |
|
456 |
@sorted = sort(@set); # feh, good enough |
|
457 |
} elsif(@set >= SOME_BIG_NUMBER) { |
|
458 |
use locale; |
|
459 |
@sorted = sort(@set); # feh, good enough |
|
460 |
} else { |
|
461 |
# but keep it pretty for normal cases |
|
462 |
@sorted = nsort(@set); |
|
463 |
} |
|
464 |
|
|
465 |
=item * |
|
466 |
|
|
467 |
If you do adapt the routines in this module, email me; I'd |
|
468 |
just be interested in hearing about it. |
|
469 |
|
|
470 |
=item * |
|
471 |
|
|
472 |
Thanks to the EFNet #perl people for encouraging this module, |
|
473 |
especially magister and a-mused. |
|
474 |
|
|
475 |
=back |
|
476 |
|
|
477 |
=head1 COPYRIGHT AND DISCLAIMER |
|
478 |
|
|
479 |
Copyright 2001, Sean M. Burke C<sburke@cpan.org>, all rights |
|
480 |
reserved. This program is free software; you can redistribute it |
|
481 |
and/or modify it under the same terms as Perl itself. |
|
482 |
|
|
483 |
This program is distributed in the hope that it will be useful, but |
|
484 |
without any warranty; without even the implied warranty of |
|
485 |
merchantability or fitness for a particular purpose. |
|
486 |
|
|
487 |
=head1 AUTHOR |
|
488 |
|
|
489 |
Sean M. Burke C<sburke@cpan.org> |
|
490 |
|
|
491 |
=cut |
|
492 |
|
|
493 |
############ END OF DOCS ############ |
|
494 |
|
|
495 |
############################################################################ |
|
496 |
############################################################################ |
|
497 |
|
|
498 |
############ BEGIN OLD STUFF ############ |
|
499 |
|
|
500 |
# We can't have "use integer;", or else (5 <=> 5.1) comes out "0" ! |
|
501 |
|
|
502 |
#----------------------------------------------------------------------------- |
|
503 |
sub nsort { |
|
504 |
my($cmp, $lc); |
|
505 |
return @_ if @_ < 2; # Just to be CLEVER. |
|
506 |
|
|
507 |
my($x, $i); # scratch vars |
|
508 |
|
|
509 |
# And now, the GREAT BIG Schwartzian transform: |
|
510 |
|
|
511 |
map |
|
512 |
$_->[0], |
|
513 |
|
|
514 |
sort { |
|
515 |
# Uses $i as the index variable, $x as the result. |
|
516 |
$x = 0; |
|
517 |
$i = 1; |
|
518 |
DEBUG and print "\nComparing ", map("{$_}", @$a), |
|
519 |
' : ', map("{$_}", @$b), , "...\n"; |
|
520 |
|
|
521 |
while($i < @$a and $i < @$b) { |
|
522 |
DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ", |
|
523 |
$a->[$i] cmp $b->[$i], "\n"; |
|
524 |
last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic |
|
525 |
++$i; |
|
526 |
|
|
527 |
DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ", |
|
528 |
$a->[$i] <=> $b->[$i], "\n"; |
|
529 |
last if ($x = ($a->[$i] <=> $b->[$i])); # numeric |
|
530 |
++$i; |
|
531 |
} |
|
532 |
|
|
533 |
DEBUG and print "{$a->[0]} : {$b->[0]} is ", |
|
534 |
$x || (@$a <=> @$b) || 0 |
|
535 |
,"\n" |
|
536 |
; |
|
537 |
$x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]); |
|
538 |
# unless we found a result for $x in the while loop, |
|
539 |
# use length as a tiebreaker, otherwise use cmp |
|
540 |
# on the original string as a fallback tiebreaker. |
|
541 |
} |
|
542 |
|
|
543 |
map { |
|
544 |
my @bit = ($x = defined($_) ? $_ : ''); |
|
545 |
|
|
546 |
if($x =~ m/^[+-]?(?=\d|\.\d)\d*(?:\.\d*)?(?:[Ee](?:[+-]?\d+))?\z/s) { |
|
547 |
# It's entirely purely numeric, so treat it specially: |
|
548 |
push @bit, '', $x; |
|
549 |
} else { |
|
550 |
# Consume the string. |
|
551 |
while(length $x) { |
|
552 |
push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : ''; |
|
553 |
push @bit, ($x =~ s/^(\d+)//s) ? $1 : 0; |
|
554 |
} |
|
555 |
} |
|
556 |
DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n"; |
|
557 |
|
|
558 |
# End result: [original bit , (text, number), (text, number), ...] |
|
559 |
# Minimally: [0-length original bit,] |
|
560 |
# Examples: |
|
561 |
# ['10' => '' , 10, ] |
|
562 |
# ['fo900' => 'fo' , 900, ] |
|
563 |
# ['foo10' => 'foo', 10, ] |
|
564 |
# ['foo9.pl' => 'foo', 9, , '.pl', 0 ] |
|
565 |
# ['foo32.pl' => 'foo', 32, , '.pl', 0 ] |
|
566 |
# ['foo325.pl' => 'foo', 325, , '.pl', 0 ] |
|
567 |
# Yes, always an ODD number of elements. |
|
568 |
|
|
569 |
\@bit; |
|
570 |
} |
|
571 |
@_; |
|
572 |
} |
|
573 |
|
|
574 |
#----------------------------------------------------------------------------- |
|
575 |
# Same as before, except without the pure-number trap. |
|
576 |
|
|
577 |
sub nsorts { |
|
578 |
return @_ if @_ < 2; # Just to be CLEVER. |
|
579 |
|
|
580 |
my($x, $i); # scratch vars |
|
581 |
|
|
582 |
# And now, the GREAT BIG Schwartzian transform: |
|
583 |
|
|
584 |
map |
|
585 |
$_->[0], |
|
586 |
|
|
587 |
sort { |
|
588 |
# Uses $i as the index variable, $x as the result. |
|
589 |
$x = 0; |
|
590 |
$i = 1; |
|
591 |
DEBUG and print "\nComparing ", map("{$_}", @$a), |
|
592 |
' : ', map("{$_}", @$b), , "...\n"; |
|
593 |
|
|
594 |
while($i < @$a and $i < @$b) { |
|
595 |
DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ", |
|
596 |
$a->[$i] cmp $b->[$i], "\n"; |
|
597 |
last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic |
|
598 |
++$i; |
|
599 |
|
|
600 |
DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ", |
|
601 |
$a->[$i] <=> $b->[$i], "\n"; |
|
602 |
last if ($x = ($a->[$i] <=> $b->[$i])); # numeric |
|
603 |
++$i; |
|
604 |
} |
|
605 |
|
|
606 |
DEBUG and print "{$a->[0]} : {$b->[0]} is ", |
|
607 |
$x || (@$a <=> @$b) || 0 |
|
608 |
,"\n" |
|
609 |
; |
|
610 |
$x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]); |
|
611 |
# unless we found a result for $x in the while loop, |
|
612 |
# use length as a tiebreaker, otherwise use cmp |
|
613 |
# on the original string as a fallback tiebreaker. |
|
614 |
} |
|
615 |
|
|
616 |
map { |
|
617 |
my @bit = ($x = defined($_) ? $_ : ''); |
|
618 |
|
|
619 |
while(length $x) { |
|
620 |
push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : ''; |
|
621 |
push @bit, ($x =~ s/^(\d+)//s) ? $1 : 0; |
|
622 |
} |
|
623 |
DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n"; |
|
624 |
|
|
625 |
# End result: [original bit , (text, number), (text, number), ...] |
|
626 |
# Minimally: [0-length original bit,] |
|
627 |
# Examples: |
|
628 |
# ['10' => '' , 10, ] |
|
629 |
# ['fo900' => 'fo' , 900, ] |
|
630 |
# ['foo10' => 'foo', 10, ] |
|
631 |
# ['foo9.pl' => 'foo', 9, , '.pl', 0 ] |
|
632 |
# ['foo32.pl' => 'foo', 32, , '.pl', 0 ] |
|
633 |
# ['foo325.pl' => 'foo', 325, , '.pl', 0 ] |
|
634 |
# Yes, always an ODD number of elements. |
|
635 |
|
|
636 |
\@bit; |
|
637 |
} |
|
638 |
@_; |
|
639 |
} |
|
640 |
|
|
641 |
#----------------------------------------------------------------------------- |
|
642 |
# Same as before, except for the sort-key-making |
|
643 |
|
|
644 |
sub nsort0 { |
|
645 |
return @_ if @_ < 2; # Just to be CLEVER. |
|
646 |
|
|
647 |
my($x, $i); # scratch vars |
|
648 |
|
|
649 |
# And now, the GREAT BIG Schwartzian transform: |
|
650 |
|
|
651 |
map |
|
652 |
$_->[0], |
|
653 |
|
|
654 |
sort { |
|
655 |
# Uses $i as the index variable, $x as the result. |
|
656 |
$x = 0; |
|
657 |
$i = 1; |
|
658 |
DEBUG and print "\nComparing ", map("{$_}", @$a), |
|
659 |
' : ', map("{$_}", @$b), , "...\n"; |
|
660 |
|
|
661 |
while($i < @$a and $i < @$b) { |
|
662 |
DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ", |
|
663 |
$a->[$i] cmp $b->[$i], "\n"; |
|
664 |
last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic |
|
665 |
++$i; |
|
666 |
|
|
667 |
DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ", |
|
668 |
$a->[$i] <=> $b->[$i], "\n"; |
|
669 |
last if ($x = ($a->[$i] <=> $b->[$i])); # numeric |
|
670 |
++$i; |
|
671 |
} |
|
672 |
|
|
673 |
DEBUG and print "{$a->[0]} : {$b->[0]} is ", |
|
674 |
$x || (@$a <=> @$b) || 0 |
|
675 |
,"\n" |
|
676 |
; |
|
677 |
$x || (@$a <=> @$b) || ($a->[0] cmp $b->[0]); |
|
678 |
# unless we found a result for $x in the while loop, |
|
679 |
# use length as a tiebreaker, otherwise use cmp |
|
680 |
# on the original string as a fallback tiebreaker. |
|
681 |
} |
|
682 |
|
|
683 |
map { |
|
684 |
my @bit = ($x = defined($_) ? $_ : ''); |
|
685 |
|
|
686 |
if($x =~ m/^[+-]?(?=\d|\.\d)\d*(?:\.\d*)?(?:[Ee](?:[+-]?\d+))?\z/s) { |
|
687 |
# It's entirely purely numeric, so treat it specially: |
|
688 |
push @bit, '', $x; |
|
689 |
} else { |
|
690 |
# Consume the string. |
|
691 |
while(length $x) { |
|
692 |
push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : ''; |
|
693 |
# Secret sauce: |
|
694 |
if($x =~ s/^(\d+)//s) { |
|
695 |
if(substr($1,0,1) eq '0' and $1 != 0) { |
|
696 |
push @bit, $1 / (10 ** length($1)); |
|
697 |
} else { |
|
698 |
push @bit, $1; |
|
699 |
} |
|
700 |
} else { |
|
701 |
push @bit, 0; |
|
702 |
} |
|
703 |
} |
|
704 |
} |
|
705 |
DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n"; |
|
706 |
|
|
707 |
\@bit; |
|
708 |
} |
|
709 |
@_; |
|
710 |
} |
|
711 |
|
|
712 |
#----------------------------------------------------------------------------- |
|
713 |
# Like nsort0, but WITHOUT pure number handling, and WITH special treatment |
|
714 |
# of pulling off extensions and version numbers. |
|
715 |
|
|
716 |
sub nsortf { |
|
717 |
return @_ if @_ < 2; # Just to be CLEVER. |
|
718 |
|
|
719 |
my($x, $i); # scratch vars |
|
720 |
|
|
721 |
# And now, the GREAT BIG Schwartzian transform: |
|
722 |
|
|
723 |
map |
|
724 |
$_->[0], |
|
725 |
|
|
726 |
sort { |
|
727 |
# Uses $i as the index variable, $x as the result. |
|
728 |
$x = 0; |
|
729 |
$i = 3; |
|
730 |
DEBUG and print "\nComparing ", map("{$_}", @$a), |
|
731 |
' : ', map("{$_}", @$b), , "...\n"; |
|
732 |
|
|
733 |
while($i < @$a and $i < @$b) { |
|
734 |
DEBUG and print " comparing $i: {$a->[$i]} cmp {$b->[$i]} => ", |
|
735 |
$a->[$i] cmp $b->[$i], "\n"; |
|
736 |
last if ($x = ($a->[$i] cmp $b->[$i])); # lexicographic |
|
737 |
++$i; |
|
738 |
|
|
739 |
DEBUG and print " comparing $i: {$a->[$i]} <=> {$b->[$i]} => ", |
|
740 |
$a->[$i] <=> $b->[$i], "\n"; |
|
741 |
last if ($x = ($a->[$i] <=> $b->[$i])); # numeric |
|
742 |
++$i; |
|
743 |
} |
|
744 |
|
|
745 |
DEBUG and print "{$a->[0]} : {$b->[0]} is ", |
|
746 |
$x || (@$a <=> @$b) || 0 |
|
747 |
,"\n" |
|
748 |
; |
|
749 |
$x || (@$a <=> @$b ) || ($a->[1] cmp $b->[1]) |
|
750 |
|| ($a->[2] <=> $b->[2]) || ($a->[0] cmp $b->[0]); |
|
751 |
# unless we found a result for $x in the while loop, |
|
752 |
# use length as a tiebreaker, otherwise use the |
|
753 |
# lc'd extension, otherwise the verison, otherwise use |
|
754 |
# the original string as a fallback tiebreaker. |
|
755 |
} |
|
756 |
|
|
757 |
map { |
|
758 |
my @bit = ( ($x = defined($_) ? $_ : ''), '',0 ); |
|
759 |
|
|
760 |
{ |
|
761 |
# Consume the string. |
|
762 |
|
|
763 |
# First, pull off any VAX-style version |
|
764 |
$bit[2] = $1 if $x =~ s/;(\d+)$//; |
|
765 |
|
|
766 |
# Then pull off any apparent extension |
|
767 |
if( $x !~ m/^\.+$/s and # don't mangle ".", "..", or "..." |
|
768 |
$x =~ s/(\.[^\.\;]*)$//sg |
|
769 |
# We could try to avoid catching all-digit extensions, |
|
770 |
# but I think that's getting /too/ clever. |
|
771 |
) { |
|
772 |
$i = $1; |
|
773 |
if($x =~ m<[^\\\://]$>s) { |
|
774 |
# We didn't take the whole basename. |
|
775 |
$bit[1] = lc $i; |
|
776 |
DEBUG and print "Consuming extension \"$1\"\n"; |
|
777 |
} else { |
|
778 |
# We DID take the whole basename. Fix it. |
|
779 |
$x = $1; # Repair it. |
|
780 |
} |
|
781 |
} |
|
782 |
|
|
783 |
push @bit, '', -1 if $x =~ m/^\./s; |
|
784 |
# A hack to make .-initial filenames sort first, regardless of locale. |
|
785 |
# And -1 is always a sort-firster, since in the code below, there's |
|
786 |
# no allowance for filenames containing negative numbers: -1.dat |
|
787 |
# will be read as string '-' followed by number 1. |
|
788 |
|
|
789 |
while(length $x) { |
|
790 |
push @bit, ($x =~ s/^(\D+)//s) ? lc($1) : ''; |
|
791 |
# Secret sauce: |
|
792 |
if($x =~ s/^(\d+)//s) { |
|
793 |
if(substr($1,0,1) eq '0' and $1 != 0) { |
|
794 |
push @bit, $1 / (10 ** length($1)); |
|
795 |
} else { |
|
796 |
push @bit, $1; |
|
797 |
} |
|
798 |
} else { |
|
799 |
push @bit, 0; |
|
800 |
} |
|
801 |
} |
|
802 |
} |
|
803 |
|
|
804 |
DEBUG and print "$bit[0] => ", map("{$_} ", @bit), "\n"; |
|
805 |
|
|
806 |
\@bit; |
|
807 |
} |
|
808 |
@_; |
|
809 |
} |
|
810 |
|
|
811 |
# yowza yowza yowza. |
|
812 |
|
Auch abrufbar als: Unified diff
Module: Fallback-Version von Sort::Naturally entfernt
Ist in Debian/Ubuntu inzwischen paketiert.