lib/Bio/MUST/Core/Ali.pm


            
              1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
—
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
              package Bio::MUST::Core::Ali;
# ABSTRACT: Multiple sequence alignment
# CONTRIBUTOR: Catherine COLSON <ccolson@doct.uliege.be>
# CONTRIBUTOR: Arnaud DI FRANCO <arnaud.difranco@gmail.com>
$Bio::MUST::Core::Ali::VERSION = '0.250380';
use Moose;
use namespace::autoclean;
use autodie;
use feature qw(say);
# use Smart::Comments;
use Carp;
use File::Temp;
use List::AllUtils qw(uniq indexes sum0);
use Path::Class qw(file);
use POSIX qw(ceil floor);
use Statistics::Descriptive;
use Tie::IxHash;
use Bio::MUST::Core::Types;
use Bio::MUST::Core::Constants qw(:ncbi :gaps :files);
use aliased 'Bio::MUST::Core::Seq';
use aliased 'Bio::MUST::Core::SeqId';
use aliased 'Bio::MUST::Core::SeqMask';
# TODO: add information about methods available in Ali-like objects
# ATTRIBUTES
has 'seqs' => (
    traits   => ['Array'],
    is       => 'ro',
    isa      => 'ArrayRef[Bio::MUST::Core::Seq]',
    default  => sub { [] },
    writer   => '_set_seqs',
    handles  => {
          add_seq  => 'push',
          get_seq  => 'get',
          set_seq  => 'set',
       delete_seq  => 'delete',
       insert_seq  => 'insert',
        count_seqs => 'count',
          all_seqs => 'elements',
        first_seq  => 'first',
       filter_seqs => 'grep',
    },
);
has 'file' => (
    is       => 'ro',
    isa      => 'Bio::MUST::Core::Types::File',
    default  => 'untitled.ali',
    coerce   => 1,
    handles  => {
        filename => 'stringify',
    },
);
has 'guessing' => (
    traits   => ['Bool'],
    is       => 'ro',
    isa      => 'Bool',
    default  => 1,
    handles  => {
        dont_guess => 'unset',
        guess      => 'set',
    },
);
with 'Bio::MUST::Core::Roles::Commentable',
     'Bio::MUST::Core::Roles::Listable';
with 'Bio::MUST::Core::Roles::Aliable';     ## no critic (ProhibitMultipleWiths)
# CONSTRUCTORS
sub clone {
    my $self = shift;
    return $self->new(
        comments => [ $self->all_comments ],
        seqs     => [ map { $_->clone } $self->all_seqs ],
        file     => file( $self->filename ),
        guessing => $self->guessing,
    );
}
# ACCESSORS
sub get_seq_with_id {
    my $self = shift;
    my $id   = shift;
    my $seq = $self->first_seq( sub { $_->full_id eq $id } );
    carp "[BMC] Warning: cannot find seq with id: $id; returning undef!"
        unless $seq;
    return $seq;
}
sub all_new_seqs {
    return shift->filter_seqs( sub {     $_->is_new } );
}
sub all_but_new_seqs {
    return shift->filter_seqs( sub { not $_->is_new } );
}
sub all_seq_ids {
    my $self = shift;
    return map { $_->seq_id } $self->all_seqs;
}
# PROPERTIES
sub has_uniq_ids {
    my $self = shift;
    my @ids = map { $_->full_id } $self->all_seq_ids;
    return 1 if $self->count_seqs == uniq @ids;
    return 0;
}
sub is_protein {
    my $self = shift;
    return 1 if List::AllUtils::any { $_->is_protein } $self->all_seqs;
    return 0;
}
sub is_aligned {
    my $self = shift;
    return 0 if not $self->guessing;
    return 1 if List::AllUtils::any { $_->is_aligned } $self->all_seqs;
    return 0;
}
sub width {
    my $self = shift;
    $self->uniformize if $self->is_aligned;     # pad seqs for robustness
    return $self->_max_seq_len;
}
sub _max_seq_len {
    my $self = shift;
    my @lengths = map { $_->seq_len } $self->all_seqs;
    return (List::AllUtils::max @lengths) // 0;     # to avoid warnings
}
sub seq_len_stats {
    my $self = shift;
    my @lengths = map { $_->nomiss_seq_len } $self->all_seqs;
    my $stat = Statistics::Descriptive::Full->new;
       $stat->add_data( \@lengths );
    my @quantiles = map { $stat->quantile($_) } 0..4;
    return @quantiles;
}
sub perc_miss {
    my $self = shift;
    my $n = sum0 map { $_->nomiss_seq_len } $self->all_seqs;
    return 0.0 unless $n;
    my $total = $self->width * $self->count_seqs;
    return 100.0 * ($total - $n) / $total;
}
# MUTATORS
sub uc_seqs {
    my $self = shift;
    $_->uc for $self->all_seqs;
    return $self;
}
sub recode_seqs {                           ## no critic (RequireArgUnpacking)
    my $self = shift;
    $_->recode(@_) for $self->all_seqs;
    return $self;
}
sub degap_seqs {
    my $self = shift;
    $_->degap for $self->all_seqs;
    return $self;
}
sub spacify_seqs {
    my $self = shift;
    $_->spacify for $self->all_seqs;
    return $self;
}
sub gapify_seqs {                           ## no critic (RequireArgUnpacking)
    my $self = shift;
    $_->gapify(@_) for $self->all_seqs;
    return $self;
}
sub trim_seqs {
    my $self = shift;
    $_->trim for $self->all_seqs;
    return $self;
}
sub pad_seqs {
    my $self = shift;
    my $bound = $self->_max_seq_len;
    $_->pad_to($bound) for $self->all_seqs;
    return $self;
}
sub uniformize {
    my $self = shift;
    # TODO: profile code (triggers?) to avoid useless multiple calls
    $self->spacify_seqs;
    $self->trim_seqs;
    $self->pad_seqs;
    return $self;
}
sub clear_new_tags {
    my $self = shift;
    $_->clear_new_tag for $self->all_seqs;
    return $self;
}
sub shorten_ids {                           ## no critic (RequireArgUnpacking)
    return shift->_change_ids_(1, @_);
}
sub restore_ids {                           ## no critic (RequireArgUnpacking)
    return shift->_change_ids_(0, @_);
}
sub _change_ids_ {
    my $self   = shift;
    my $abbr   = shift;
    my $mapper = shift;
    for my $seq ($self->all_seqs) {
        my $new_id = $abbr ? $mapper->abbr_id_for( $seq->full_id )
                           : $mapper->long_id_for( $seq->full_id );
        $seq->set_seq_id($new_id) if $new_id;
    }                       # Note: leave id alone if not found
    return $self;
}
sub apply_list {
    my $self = shift;
    my $list = shift;
    # loop through seqs from bottom to top
    # ... and delete seq from Ali if not listed in list
    my $seq_n = $self->count_seqs;
    for (my $i = $seq_n-1; $i >= 0; $i--) {
        $self->delete_seq($i)
            unless $list->is_listed( $self->get_seq($i)->full_id );
    }
    return $self;
}
sub _premask_check {
    my $self = shift;
    my $mask = shift;
    # warn of unaligned Ali
    carp '[BMC] Note: Ali does not look aligned!'
        . ' This might not be an issue if seqs are ultra-conserved!'
        unless $self->is_aligned;
    $self->uniformize;
    # warn of empty SeqMask
    carp '[BMC] Note: applying this mask will result in a zero-width Ali!'
        unless List::AllUtils::any { $_ } $mask->all_states;
    # check that SeqMask is compatible with Ali
    # potential bugs could come from constant sites etc
    my $a_width = $self->width;
    my $m_width = $mask->mask_len;
    carp "[BMC] Note: Ali width does not match mask len: $a_width vs. $m_width!"
        . ' This might not be an issue if the mask results from blocks.'
        unless $a_width == $m_width;
    return;                 # return values of before subs are ignored
}
before 'apply_mask' => \&_premask_check;
sub apply_mask {
    my $self = shift;
    my $mask = shift;
    # select sites for each seq using a precomputed array slice
    my @indexes = indexes { $_ } $mask->all_states;
    $_->_set_seq( join q{}, ( $_->all_states )[@indexes] )
        for $self->all_seqs;
    return $self;
}
sub idealize {                              ## no critic (RequireArgUnpacking)
    my $self = shift;
    return $self->apply_mask( SeqMask->ideal_mask($self, @_) );
}
# MISC METHODS
# TODO: introduce miss_regex (without gaps?)
sub gapmiss_regex {
    return shift->is_protein ? $GAPPROTMISS : $GAPDNAMISS;
}
sub map_coords {
    my $self      = shift;
    my $id        = shift;
    my $coords_in = shift;
    my $seq = $self->get_seq_with_id($id);
    my @states = $seq->all_states;
    my $count = 0;
    my @index;
    for my $state (@states) {
        $count++ if $state !~ $GAP;
        push @index, $count;
    }
    my @coords_out = map { $index[$_-1] } @{$coords_in};
    return \@coords_out;
}
# ALIASES
sub height {
    return shift->count_seqs;
}
# I/O METHODS
sub load {
    my $class  = shift;
    my $infile = shift;
    open my $in, '<', $infile;
    my $ali = $class->new( file => $infile );
    my $seq_id;
    my $seq;
    LINE:
    while (my $line = <$in>) {
        chomp $line;
        # skip empty lines and process comments
        next LINE if $line =~ $EMPTY_LINE
                  || $ali->is_comment($line);
        # at each '>' char...
        my ($defline) = $line =~ $DEF_LINE;
        if ($defline) {
            # add current seq to ali (if any)
            if ($seq) {
                my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
                $ali->add_seq($new_seq);
                $seq = q{};
            }
            $seq_id = $defline;
            next LINE;
        }
        # elongate current seq (seqs can be broken on several lines)
        $seq .= $line;
    }
    # add last seq to ali (if any)
    if ($seq) {
        my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
        $ali->add_seq($new_seq);
    }
    return $ali;
}
before qr{\Astore}xms => sub {
    my $self = shift;
    # perform pre-storage duties
    carp '[BMC] Warning: non unique seq ids!' unless $self->has_uniq_ids;
    $self->uniformize if $self->is_aligned;
    return;
};
sub store {                                 ## no critic (RequireArgUnpacking)
    my $self = shift;
    my $outfile = shift;
    # automatically redirect to store_fasta for non-Ali outfile names
    return $self->store_fasta($outfile, @_)     # note the currying
        unless $outfile =~ $ALI_SUFFIX;
    open my $out, '>', $outfile;
    print {$out} $self->header;
    for my $seq ($self->all_seqs) {
        say {$out} '>' . $seq->full_id;
        say {$out} $seq->seq;
    }
    return;
}
sub store_fasta {
    my $self    = shift;
    my $outfile = shift;
    my $args    = shift // {};          # HashRef (should not be empty...)
    my $degap = $args->{degap}  //  0;
    my $clean = $args->{clean}  //  0;
    my $gap   = $args->{gapify} // 'X';
    my $chunk = $args->{chunk}  // 60;
    my $nowrap = $chunk < 0 ? 1 : 0;
    my $is_aligned = $self->is_aligned;
    open my $out, '>', $outfile;
    for my $seq ($self->all_seqs) {
        say {$out} '>' . $seq->foreign_id;
        # optionally clean and/or degap seq
        $seq = $seq->clone  if $clean || $degap;    # clone seq only if needed
        $seq->gapify($gap)  if $clean;
        $seq->degap         if $degap;
        my $width = $seq->seq_len;
        $chunk = $width     if $nowrap;             # optionally disable wrap
        my $str = $seq->wrapped_str($chunk);
        $str =~ s{$GAP}{-}xmsg if $is_aligned;      # restore '-' when aligned
        print {$out} $str;
    }
    return;
}
sub temp_fasta {
    my $self = shift;
    my $args = shift // {};             # HashRef (should not be empty...)
    # abbreviate ids (possibly using a custom prefix, offset, or IdMapper)
    my $mapper = $args->{mapper} // $self->std_mapper($args);
    $self->shorten_ids($mapper);
    # write temporary .fasta file using standard abbr_ids
    # ...and restore long_ids afterwards
    my $out = File::Temp->new(UNLINK => 0, EXLOCK => 0, SUFFIX => '.fasta');
    $self->store_fasta($out->filename, $args);
    $self->restore_ids($mapper);
    ### filename: $out->filename
    return wantarray ? ($out->filename, $mapper) : $out->filename;
}
# TODO: add an option to remove trailing '_' in ids?
sub load_phylip {
    my $class  = shift;
    my $infile = shift;
    open my $in, '<', $infile;
    my $ali = $class->new( file => $infile );
    my $seq_n;
    my $site_n;
    my $n = 0;
    LINE:
    while (my $line = <$in>) {
        chomp $line;
        # skip empty lines
        next LINE if $line =~ $EMPTY_LINE;
        # get matrix dimensions
        if ($line =~ $DIM_LINE) {
            ($seq_n, $site_n) = ($1, $2);
            next LINE;
        }
        # process regular line (identifier is optional)
        my ($seq_id, $seq) = $line =~ $PHY_LINE;
        # should never happen...
        croak "[BMC] Error: unable to parse PHYLIP file at line $.; aborting!"
            unless $seq;
        # delete optional spaces
        $seq =~ tr/ //d;
        # process first seq block
        if ($ali->count_seqs < $seq_n) {
            # seq_id are mandatory in first block
            croak "[BMC] Error: missing id in PHILIP file at line $.; aborting!"
                unless $seq_id;
            # store first seq chunk along with seq_id
            my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
            $ali->add_seq($new_seq);
        }
        # process remaining seq blocks
        else {
            my $curr_seq = $ali->get_seq($n);
            # elongate current with partial seq chunk mistaken as seq_id
            if ($seq_id && $seq_id ne $curr_seq->full_id) {
                $curr_seq->append_seq($seq_id);
            }
            # elongate current seq with new seq chunk
            $curr_seq->append_seq($seq);
        }
        # prepare reading of next seq or next block
        $n = 0 if ++$n == $seq_n;
    }
    my $width = $ali->width;
    croak "[BMC] Error: unexpected site number in PHYLIP file: $width;"
        . ' aborting!' if $width != $site_n;
    return $ali;
}
# PHYLIP: http://evolution.genetics.washington.edu/phylip/doc/main.html
# The information for each species follows, starting with a ten-character
# species name (which can include blanks and some punctuation marks), and
# continuing with the characters for that species. The name should be on the
# same line as the first character of the data for that species. (...) The
# name should be ten characters in length, filled out to the full ten
# characters by blanks if shorter. Any printable ASCII/ISO character is
# allowed in the name, except for parentheses ("(" and ")"), square brackets
# ("[" and "]"), colon (":"), semicolon (";") and comma (","). (...) Note that
# in these sequences we have a blank every ten sites to make them easier to
# read: any such blanks are allowed. The blank line which separates the two
# groups of lines (the ones containing sites 1-20 and ones containing sites
# 21-39) may or may not be present.
# PhyML: http://www.atgc-montpellier.fr/phyml/usersguide.php?type=command
# The input sequence file is a standard PHYLIP file of aligned DNA or
# amino-acids sequences. (...) The maximum number of characters in species
# name MUST not exceed 100. Blanks and the symbols "(),:" are not allowed
# within sequence names because the Newick tree format makes special use of
# these symbols. However, blanks (one or more) MUST appear at the end of each
# species name.
# TREE-FINDER: http://www.treefinder.de/tf-march2011-manual.pdf
# A sequence name may consist of 1 to 10 alphanumeric characters, dashes "-",
# dots ".", underscores "_", or some of "/", "?", "*", "+". No space is
# allowed inside the names. The first name character must be a letter or an
# underscore. A sequence fragment may start behind position 10 after a
# sequence name, or anywhere in a line without a name.
# TREE-PUZZLE [data/globin.a]
#   7 128
# HBB_HUMAN      HLTPEEKSAV TALWGKVNVD EVGGEALGRL LVVYPWTQRF FESFDLSMGN
# HBB_HORSE      QLSGEEKAAV LALWDKVNEE EVGGEALGRL LVVYPWTQRF FDSFDLSMGN
# HBA_HUMAN      VLSPADKTNV KAAWGKVGAG EYGAEALERM FLSFPTTKTY FPHFDLSHGS
# HBA_HORSE      VLSAADKTNV KAAWSKVGAG EYGAEALERM FLGFPTTKTY FPHFDLSHGS
# MYG_PHYCA      VLSEGEWQLV LHVWAKVEVA GHGQDILIRL FKSHPETLEK FDRFHLKKAS
# GLB5_PETMA     PLSAAEKTKI RSAWAPVYYE TSGVDILVKF FTSTPAAQEF FPKFGLTKKS
# LGB2_LUPLU     ALTESQAALV KSSWEEFNIP KHTHRFFILV LEIAPAAKDL FSFLGTSQNN
# RAXML: http://sco.h-its.org/exelixis/oldPage/RAxML-Manual.7.0.4.pdf
# The input alignment format of RAxML is relaxed interleaved or sequential
# PHYLIP. "Relaxed" means that sequence names can be of variable length
# between 1 up to 256 characters. (...) Prohibited Character(s) in taxon names
# taxon names that contain any form of whitespace character, like blanks,
# tabulators, and carriage returns, as well as one of the following prohibited
# characters: :,();[].
# PHYLOBAYES: http://megasun.bch.umontreal.ca/People/lartillot/www/phylobayes3.3e.pdf
# Taxon names may contain more than 10 characters. Avoid special characters
# such as ';' or ')', which will create problems when parsing trees. The best
# is to only use letters, digits and '_'. Sequences can be interrupted by
# space and tab, but not by return characters. Be sure that the lengths of the
# sequences are all the same, and identical to the lengths indicated in the
# header. Sequences can be interleaved, in which case the taxon names may or
# may not be repeated in each block.
sub store_phylip {
    my $self    = shift;
    my $outfile = shift;
    my $args    = shift // {};          # HashRef (should not be empty...)
    my $short = $args->{short} // 1;
    my $clean = $args->{clean} // 0;
    my $chunk = $args->{chunk} // 60;
    open my $out, '>', $outfile;
    # print data matrix dimensions
    my $height = $self->count_seqs;
    my $width  = $self->width;
    say {$out} $height . q{ } . $width;
    # setup id format (this will also affect block-like structure)
    my $format = $short ? "%-10.10s %s\n" : "%s %s\n";
    my $method = $short ? 'full_id'       : 'foreign_id';
    my $sep    = $short ? q{ }            : q{};
    # optionally disable wrapping
    $chunk = $width if $chunk < 0;
    # optionally clean seq
    my @seqs = $self->all_seqs;
       @seqs = map { $_->clone->gapify('X') } @seqs if $clean;
    # output Ali in sequential or interleaved format
    for (my $site = 0; $site < $width; $site += $chunk) {
        # leave empty line between chunks (but not after last chunk)
        print {$out} "\n" if $site;
        for my $seq (@seqs) {
            # print 10-chars ids only once
            # Note: full_ids are only truncated (use IdMapper for more)
            my $id = $site == 0 ? $seq->$method : q{};
            # print seq chunks in 10-state blocks
            # Note: We insert a space in the 11th column to ensure that more
            # software packages can read the written files. PHYLIP itself will
            # ignore this spurious space in the 'sequences'.
            ( my $str = $seq->edit_seq($site, $chunk) ) =~ s{$GAP}{-}xmsg;
            printf {$out} $format, $id, join $sep,
                map { substr($str, $_*10, 10) } 0..ceil(length($str)/10)-1;
        }
    }
    return;
}
sub load_stockholm {
    my $class  = shift;
    my $infile = shift;
    open my $in, '<', $infile;
    my $ali = $class->new( file => $infile );
    tie my %seq_for, 'Tie::IxHash';
    LINE:
    while (my $line = <$in>) {
        chomp $line;
        # process GF comments
        next LINE if $ali->is_comment($line, $STK_COMMENT);
        # skip empty lines and other comment lines
        next LINE if $line =~ $EMPTY_LINE
                  || $line =~ $COMMENT_LINE;
        last LINE if $line =~ $END_LINE;
        # parse stockholm seq
        my ($seq_id, $seq) = $line =~ $STK_SEQ;
        # replace dots and gaps by stars
        $seq =~ s/[\.\-]/*/xmsg;
        # elongate seq
        $seq_for{$seq_id} .= $seq;
    }
    # populate Ali object
    while (my ($seq_id, $seq) = each %seq_for) {
        my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
        $ali->add_seq($new_seq);
    }
    return $ali;
}
# <TSeq>
#  <TSeq_seqtype value="nucleotide"/>
#  <TSeq_gi>160476623</TSeq_gi>
#  <TSeq_accver>EY249574.1</TSeq_accver>
#  <TSeq_sid>gnl|dbEST|50783737</TSeq_sid>
#  <TSeq_taxid>5061</TSeq_taxid>
#  <TSeq_orgname>Aspergillus niger</TSeq_orgname>
#  <TSeq_defline>CATY7117.fwd CATY Aspergillus niger fungal ...</TSeq_defline>
#  <TSeq_length>906</TSeq_length>
#  <TSeq_sequence>ACGT...</TSeq_sequence>
# </TSeq>
sub load_tinyseq {
    my $class  = shift;
    my $infile = shift;
    my $args   = shift // {};           # HashRef (should not be empty...)
    my $kps = $args->{keep_strain} // 0;
    open my $in, '<', $infile;
    my $ali = $class->new( file => $infile );
    my ($acc, $tax, $org, $seq);
    # Note: crude parser derived from old tseq2ali.pl
    while (my $line = <$in>) {
        chomp $line;
        # capture attributes
        if      ($line =~ m{<TSeq_accver>      (.*?) </TSeq_accver>  }xms) {
            $acc = $1;
        } elsif ($line =~ m{<TSeq_taxid> ($NCBIPKEY) </TSeq_taxid>   }xms) {
            $tax = $1;
        } elsif ($line =~ m{<TSeq_orgname>  (.*?)    </TSeq_orgname> }xms) {
            $org = $1;
        } elsif ($line =~ m{<TSeq_sequence> (.*?)    </TSeq_sequence>}xms) {
            $seq = $1;
        }
        # process current seq (if any)
        if ($seq) {
            # build full_id from NCBI org
            my $seq_id = SeqId->new_with(
                org         => $org,
                taxon_id    => $tax,
                accession   => $acc,
                keep_strain => $kps,
            );
            # add current seq to ali
            my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
            $ali->add_seq($new_seq);
            # clear attributes for next seq
            ($acc, $tax, $org, $seq) = () x 4;
        }
    }
    return $ali;
}
sub instant_store {
    my $class   = shift;
    my $outfile = shift;
    my $args    = shift // {};          # HashRef (should not be empty...)
    my $infile  = $args->{infile};
    croak '[BMC] Error: no infile specified for instant_store; aborting!'
        unless $infile;
    my $coderef = $args->{coderef};
    croak '[BMC] Error: no coderef specified for instant_store; aborting!'
        unless $coderef;
    open my $in,  '<', $infile;
    open my $out, '>', $outfile;
    my $seq_id;
    my $seq;
    LINE:
    while (my $line = <$in>) {
        chomp $line;
        # skip empty lines and process comments
        next LINE if $line =~ $EMPTY_LINE
                  || $line =~ $COMMENT_LINE;
        # at each '>' char...
        my ($defline) = $line =~ $DEF_LINE;
        if ($defline) {
            # process current seq (if any)
            if ($seq) {
                my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
                print {$out} $coderef->($new_seq);
                $seq = q{};
            }
            $seq_id = $defline;
            next LINE;
        }
        # elongate current seq (seqs can be broken on several lines)
        $seq .= $line;
    }
    # process last seq (if any)
    if ($seq) {
        my $new_seq = Seq->new( seq_id => $seq_id, seq => $seq );
        print {$out} $coderef->($new_seq);
    }
    return;
}
sub instant_count {
    my $class  = shift;
    my $infile = shift;
    my $seq_n = 0;
    open my $in, '<', $infile;
    while (<$in>) { $seq_n++ if m/^>/xms }
    return $seq_n;
}
__PACKAGE__->meta->make_immutable;
1;
__END__
=pod
=head1 NAME
Bio::MUST::Core::Ali - Multiple sequence alignment
=head1 VERSION
version 0.250380
=head1 SYNOPSIS
    #!/usr/bin/env perl
    use Modern::Perl '2011';
    # same as:
    # use strict;
    # use warnings;
    # use feature qw(say);
    use Bio::MUST::Core;
    use aliased 'Bio::MUST::Core::Ali';
    # read Ali form disk
    my $ali = Ali->load('example.ali');
    # get some properties
    say 'height:  ' . $ali->height;         # number of seqs
    say 'width:   ' . $ali->width;          # number of sites
    say '% miss:  ' . $ali->perc_miss;      # fraction of missing chars (%)
    say 'seqs are ' . $ali->is_protein ? 'prot' : 'nucl';
    # turn seqs to uppercase
    $ali->uc_seqs;
    # filter out seqs with no species associated
    my @seqs = $ali->filter_seqs( sub { not $_->is_genus_only } );
    use aliased 'Bio::MUST::Core::IdList';
    my $list = IdList->new( ids => \@seqs );
    $ali->apply_list($list);
    # alternatively:
    # $ali = Ali->new( seqs => \@seqs );
    # filter out gap-rich sites
    $ali->idealize(0.2);                    # min 20% non-gaps per site
    # filter out non-informative sites
    use aliased 'Bio::MUST::Core::SeqMask';
    my $mask = SeqMask->parsimony_mask($ali);
    $ali->apply_mask($mask);
    # write down reduced Ali to disk
    $ali->store('example-uc-genus-sp-20.ali');
    $ali->store_fasta('example-uc-genus-sp-20.fasta');
    # see below for additional methods
    # ...
=head1 DESCRIPTION
This module implements the multiple sequence alignment (MSA) class and its
methods. An Ali is modeled as an array of L<Bio::MUST::Core::Seq> objects.
Consequently, sequence ids do not absolutely need to be unique for it to
work (though id uniqueness helps a lot for sequence access and filtering).
An Ali knows whether it contains nucleotide or protein sequences, whether
those are aligned or not, as well as its Seq count and exact width. All
these properties are introspected on the fly, which means that, while they
can be expensive to compute, they are always accurate.
This is important because an Ali provides methods for inserting, deleting
and editing its Seq objects. Further, it does its best to maintain a semantic
distinction between true gap states (encoded as '*') and missing regions of
undetermined length (encoded as pure whitespace, for example at the end of a
short sequence).
It also has methods for mapping its sequence ids (for example, before export
to the PHYLIP format), as well as methods to selectively retain sequences
and sites based on L<Bio::MUST::Core::IdList> and
L<Bio::MUST::Core::SeqMask> objects. For example, the C<idealize> method
discards shared gaps and optionally removes gap-rich sites only due to a
tiny number of sequences.
Finally, an Ali can be stored in MUST pseudo-FASTA format (which handles
meaningful whitespace and allows comment lines in the header) or be
imported/exported from/to several popular MSA formats, such as plain FASTA,
STOCKHOLM and PHYLIP.
=head1 ATTRIBUTES
=head2 seqs
ArrayRef of L<Bio::MUST::Core::Seq> objects (optional)
Most of the accessor methods described below are implemented by delegation
to this public attribute using L<Moose::Meta::Attribute::Native/Moose native
delegation>. Their documentation thus heavily borrows from the corresponding
help pages.
=head2 file
L<Path::Class::File> object (optional)
This optional attribute is initialized by class methods that C<load> an Ali
from disk. It is meant to improve the introspection capabilities of the Ali.
For now, this attribute is not used by the C<store> methods, though it might
provide them with a default value in the future.
=head2 guessing
Boolean (optional)
By default, an Ali object tries to guess whether it is aligned or not by
looking for gap-like characters in any of its Seq objects (see
L<Bio::MUST::Core::Seq> for the exact test performed on each sequence).
When this smart behavior causes issues, one can disable it by unsetting this
boolean attribute (see C<dont_guess> and C<guess> accessor methods).
=head2 comments
ArrayRef of strings (optional)
An Ali object is commentable, which means that it supports all the methods
pertaining to comment lines described in
L<Bio::MUST::Core::Roles::Commentable> (such as C<header>).
=head1 CONSTRUCTORS
=head2 new
Default constructor (class method) returning a new Ali.
    use aliased 'Bio::MUST::Core::Ali';
    my $ali1 = Ali->new();
    my @seqs = $ali->all_seqs;
    my $ali2 = Ali->new( seqs => \@seqs );
This method accepts four optional arguments (see ATTRIBUTES above): C<seqs>,
C<file>, C<guessing> and C<comments>.
=head2 clone
Creates a deep copy (a clone) of the Ali. Returns the copy.
    use aliased 'Bio::MUST::Core::Ali';
    my $ali = Ali->load('input.ali');
    my $ali_copy = $ali->clone;
    # you can now mess with $ali_copy without affecting $ali
This method does not accept any arguments.
=head1 ACCESSORS
=head2 add_seq
Adds one (or more) new sequence(s) at the end of the Ali. Returns the new
number of sequences of the Ali.
    use aliased 'Bio::MUST::Core::Seq';
    my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
    $ali->add_seq($new_seq);
This method accepts any number of arguments.
=head2 get_seq
Returns a sequence of the Ali by its index. You can also use negative index
numbers, just as with Perl's core array handling. If the specified sequence
does not exist, this method will return C<undef>.
    my $seq = $ali->get_seq($index);
    croak "Seq $index not found in Ali!" unless defined $seq;
This method accepts just one argument (and not an array slice).
=head2 get_seq_with_id
Returns a sequence of the Ali by its id. If the specified id is not unique,
only the first matching sequence will be returned, whereas if no sequence
exists for the specified id, this method will return C<undef>.
    my $id = 'Pyrus malus_3750@658052655';
    my $seq = $ali->get_seq_with_id($id);
    croak "Seq $id not found in Ali!" unless defined $seq;
This method accepts just one argument.
=head2 set_seq
Given an index and a sequence, sets the specified Ali element to the
sequence. This method returns the new sequence at C<$index>.
    use aliased 'Bio::MUST::Core::Seq';
    my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
    $ali->set_seq($index, $new_seq);
This method requires two arguments.
=head2 delete_seq
Removes the sequence at the given index from the Ali. This method returns
the deleted sequence. If the specified sequence does not exist, it will
return C<undef> instead.
    $ali->delete_seq($index);
This method requires one argument.
=head2 insert_seq
Inserts a new sequence into the Ali at the given index. This method returns
the new sequence at C<$index>.
    use aliased 'Bio::MUST::Core::Seq';
    my $new_seq = Seq->new( seq_id => 'seq1', seq => 'ACGT' );
    $ali->insert_seq($index, $new_seq);
This method requires two arguments.
=head2 all_seqs
Returns all the sequences of the Ali (not an array reference).
    my @seqs = $ali->all_seqs;
This method does not accept any arguments.
=head2 first_seq
Returns the first sequence of the Ali matching a given criterion, just like
L<List::Util>'s C<first> function. This method requires a subroutine
implementing the matching logic.
    # emulate get_seq_with_id method
    my $id2find = 'seq13';
    my $seq = $ali->first_seq( sub { $_->full_id eq $id2find } );
This method requires a single argument.
=head2 filter_seqs
Returns every sequence of the Ali matching a given criterion, just like
Perl's core C<grep> function. This method requires a subroutine implementing
the matching logic.
    # keep only long sequences (ignoring gaps and missing states)
    my @long_seqs = $ali->filter_seqs( sub { $_->nomiss_seq_len > 500 } );
This method requires a single argument.
=head2 all_new_seqs
Returns all the sequences of the Ali tagged as #NEW# (not an array reference).
    my @new_seqs = $ali->all_new_seqs;
This method does not accept any arguments.
=head2 all_but_new_seqs
Returns all the sequences of the Ali except those tagged as #NEW# (not an array
reference).
    my @preexisting_seqs = $ali->all_but_new_seqs;
This method does not accept any arguments.
=head2 all_seq_ids
Returns all the sequence ids (L<Bio::MUST::Core::SeqId> objects) of the Ali
(not an array reference). This is only a convenience method.
    use Test::Deeply;
    my @ids1 = $ali->all_seq_ids;
    my @ids2 = map { $_->seq_id } $ali->all_seqs;
    is_deeply \@ids1, \@ids2, 'should be true';
    my @orgs = map { $_->org } @ids1;
This method does not accept any arguments.
=head2 filename
Returns the stringified filename of the Ali.
This method does not accept any arguments.
=head2 guess
Turn on the smart detection of gaps (see C<guessing> attribute above).
This method does not accept any arguments.
=head2 dont_guess
Turn off the smart detection of gaps (see C<guessing> attribute above).
    use aliased 'Bio::MUST::Core::Ali';
    my $ali = Ali->load('ensembl.fasta');
    $ali->dont_guess;
This method does not accept any arguments.
=head1 PROPERTIES
=head2 has_uniq_ids
Returns true if all the sequence ids are unique.
    carp 'Warning: duplicate sequence ids!' unless $ali->has_uniq_ids;
This method does not accept any arguments.
=head2 is_protein
Returns true if any sequence of the Ali looks like a protein. See
L<Bio::MUST::Core::Seq> for the exact test performed on each sequence.
    say 'Your file includes nucleotide sequences' unless $ali->is_protein;
This method does not accept any arguments.
=head2 is_aligned
Returns true if any sequence of the Ali appears to be aligned. See
L<Bio::MUST::Core::Seq> for the exact test performed on each sequence.
If the boolean attribute guessing is not set, always returns false.
    carp 'Warning: file does not look aligned!' unless $ali->is_aligned;
This method does not accept any arguments.
=head2 count_seqs
Returns the number of sequences of the Ali. The alias method C<height> is
provided for convenience.
    my $height = $ali->count_seqs;
This method does not accept any arguments.
=head2 width
Returns the width of the Ali (in characters). If the Ali is not aligned,
returns the length of the longest sequence instead.
To avoid potential bugs due to caching, this method dynamically computes the
Ali width at each call. Moreover, the Ali is always uniformized (see below)
beforehands to ensure accurate width value. Therefore, this method is
expensive and should not be called repeatedly (e.g., in a loop condition).
    # you'd better looping through sites like this...
    my $width = $ali->width;
    for my $site (0..$width-1) {
        ...
    }
This method does not accept any arguments.
=head2 seq_len_stats
Returns a list of 5 values summarizing the Ali seq lengths (ignoring gaps).
The values are the following: Q0 (min), Q1, Q2 (median), Q3, and Q4 (max).
This method does not accept any arguments.
=head2 perc_miss
Returns the percentage of missing (and gap-like) character states in the Ali.
As this method internally calls C<Ali::width>, the remarks above also apply.
    my $miss_level = $ali->perc_miss;
This method does not accept any arguments.
=head1 MUTATORS
=head2 uc_seqs
Turn all the sequences of the Ali to uppercase and returns it.
This method does not accept any arguments.
=head2 recode_seqs
Recode all the sequences of the Ali and returns it.
    use aliased 'Bio::MUST::Core::Ali';
    my $ali = Ali->load('biased.ali');
    # set up RY recoding for suppressing codon bias
    my %base_for = (
        A => 'A',   G => 'A',       # purines
        C => 'C',   T => 'C',       # pyrimidines
    );
    my $ali_rec = $ali->recode_seqs( \%base_for );
    $ali_rec->store('biased_ry.ali');
This method requires one argument.
=head2 degap_seqs
Remove the gaps in all the sequences of the Ali and returns it.
This method does not accept any arguments.
=head2 spacify_seqs
Spacifies all the sequences of the Ali and returns it. See the corresponding
method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
operation.
This method does not accept any arguments.
=head2 gapify_seqs
Gapifies all the sequences of the Ali and returns it. See the corresponding
method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
operation.
This method accepts an optional argument.
=head2 trim_seqs
Trims all the sequences of the Ali and returns it. See the corresponding
method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
operation.
This method does not accept any arguments.
=head2 pad_seqs
Pads all the sequences of the Ali and returns it. See the corresponding
method in L<Bio::MUST::Core::Seq> for the exact effect of this gap-cleaning
operation.
This method does not accept any arguments.
=head2 uniformize
Performs the three gap-cleaning operations in turn on all the sequences of
the Ali and returns it, which ensures that it is semantically clean and
rectangular.
This is only a convenience method called internally by the Ali object before
selected methods (such as storage-like methods). However, it might prove
useful in some circumstances, hence it is not defined as private.
    use aliased 'Bio::MUST::Core::Ali';
    my $ali = Ali->load('input.ali');
    $ali->add_seq(...);
    # more editing of the Ali sequences
    $ali->uniformize;
This method does not accept any arguments.
=head2 clear_new_tags
Clear the #NEW# tag (if any) from all the sequences of the Ali and returns it.
    use aliased 'Bio::MUST::Core::Ali';
    my $ali = Ali->load('input-42.ali');
    $ali->clear_new_tags;
    my @new_seqs = $ali->all_new_seqs;
    # array should be empty
This method does not accept any arguments.
=head2 shorten_ids
Replaces all the sequence ids of the Ali by their abbreviated forms as
specified by the passed L<Bio::MUST::Core::IdMapper> and returns the Ali.
Note that this method will work only if the sequence ids have not been
already shortened or modified in any way since the creation of the IdMapper.
Long ids without abbreviated forms in the IdMapper are left untouched.
    use aliased 'Bio::MUST::Core::Ali';
    use aliased 'Bio::MUST::Core::IdMapper';
    my $ali = Ali->load('input.ali');
    my $mapper = IdMapper->std_mapper( $ali, { id_prefix => 'lcl|seq' } );
    $ali->shorten_ids($mapper);
    $ali->store_fasta('input.4blast.fasta');
    # makeblastdb
    # Note: the temp_fasta method does exactly that
This method requires one argument.
=head2 restore_ids
Replaces all the sequence ids of the Ali by their long forms as specified by
the passed L<Bio::MUST::Core::IdMapper> and returns the Ali.
Note that this method will work only if the sequence ids have been previously
abbreviated (see above) and have not been modified in any way since then.
Again, abbreviated ids without long forms in the IdMapper are left untouched.
    use aliased 'Bio::MUST::Core::IdMapper';
    my $mapper = IdMapper->gi_mapper($ali);
    $ali->shorten_ids($mapper);
    ...
    $ali->restore_ids($mapper);
This method requires one argument.
=head2 apply_list
Selectively retains or discards sequences from the Ali based on the content
of the passed L<Bio::MUST::Core::IdList> object and returns the Ali.
    use aliased 'Bio::MUST::Core::IdList';
    my $list = IdList->load('interesting_seqs.idl');
    $ali->apply_list($list);                # discard non-interesting seqs
This method requires one argument.
=head2 apply_mask
Selectively retains or discards sites from the Ali based on the content of
the passed L<Bio::MUST::Core::SeqMask> object and returns the Ali.
    use aliased 'Bio::MUST::Core::SeqMask';
    my $variable_sites = SeqMask->variable_mask($ali);
    $ali->apply_mask($variable_sites);      # discard constant sites
This method requires one argument.
=head2 idealize
Computes and applies an ideal sequence mask to the Ali and returns it. This
is only a convenience method.
When invoked without arguments, it will discard the gaps that are
universally shared by all the sequences. Otherwise, the provided argument
corresponds to the threshold of the C<ideal_mask> method described in
L<Bio::MUST::Core::SeqMask>.
    use aliased 'Bio::MUST::Core::IdList';
    my $fast_seqs = IdList->load('fast_evolving_seqs.idl');
    my $seqs2keep = $fast_seqs->negative_list($ali);
    $ali->apply_list($seqs2keep);           # discard fast-evolving seqs
    $ali->idealize;      # discard newly shared gaps caused by fast seqs
    use aliased 'Bio::MUST::Core::Ali';
    my $ali = Ali->load('hmm_based.ali');
    $ali->idealize(0.05);    # discard insertions due to <5% of the seqs
This method accepts an optional argument.
=head1 MISC METHODS
=head2 gapmiss_regex
Returns a regular expression matching gaps and ambiguous or missing states.
The exact regex returned depends on the type of sequences in the Ali (nucl. or
proteins).
    my $regex = $ali->gapmiss_regex;
    my $first_seq = $ali->get_seq(0)->seq;
    my $gapmiss_n = $first_seq =~ m/($regex)/xmsg;
    say "The first sequence has $gapmiss_n gaps or ambiguous/missing sites";
This method does not accept any arguments.
=head2 map_coords
Converts a set of site positions from Ali coordinates to coordinates of the
specified sequence (thereby ignoring positions due to gaps). Returns the
converted sites in sequence coordinates as an array refrence.
    use aliased 'Bio::MUST::Core::Ali';
    my $ali = Ali->load('input.ali');
    my $id = 'GIV-Norovirus Hum.GIV.1.POL_1338688@508124125';
    my $ali_coords = [ 4, 25, 73, 89, 104, 116 ];
    my $seq_coords = $ali->map_coords($id, $ali_coords);
    # $seq_coords is [ 3, 23, 59, 71,  71,  74 ]
This method requires two arguments: the id of a sequence and an array
reference of input sites in Ali coordinates.
=head1 I/O METHODS
=head2 load
Class method (constructor) returning a new Ali read from disk. This method
will transparently import plain FASTA files in addition to the MUST
pseudo-FASTA format (ALI files).
    use Test::Deeply;
    use aliased 'Bio::MUST::Core::Ali';
    my $ali1 = Ali->load('example.ali');
    my $ali2 = Ali->load('example.fasta');
    my @seqs1 = $ali1->all_seqs;
    my @seqs2 = $ali2->all_seqs;
    is_deeply, \@seqs1, \@seqs2, 'should be true';
This method requires one argument.
=head2 store
Writes the Ali to disk in the MUST pseudo-FASTA format (ALI files).
Note that the ALI format is only used when the suffix of the outfile name is
'.ali'. In all other cases (including lack of suffix), this method
automatically forwards the call to C<store_fasta>.
    $ali->store('output.ali');
    # output.ali is written in ALI format
    $ali->store('output.fasta');
    # output.fasta is written in FASTA format
This method requires one argument (but see C<store_fasta> in case of automatic
forwarding of the method call).
=head2 store_fasta
Writes the Ali to disk in the plain FASTA format.
For compatibility purposes, this method automatically fetches sequence ids
using the C<foreign_id> method instead of the native C<full_id> method, both
described in L<Bio::MUST::Core::SeqId>.
    $ali->store_fasta( 'output.fasta' );
    $ali->store_fasta( 'output.fasta', {chunk => -1, degap => 1} );
This method requires one argument and accepts a second optional argument
controlling the output format. It is a hash reference that may contain one
or more of the following keys:
    - clean: replace all ambiguous and missing states by C<X> (default: false)
    - degap: boolean value controlling degapping (default: false)
    - chunk: line width (default is 60 chars; negative values means no wrap)
Finally, it is possible to fine-tune the behavior of the C<clean> option by
providing another character than C<X> through the C<gapify> key. This can be
useful to replace all ambiguous and missing states by gaps, as shown below:
    $ali->store_fasta( 'output.fasta, { clean => 1, gapify => '*' } );
=head2 temp_fasta
Writes a temporary copy of the Ali to disk in the plain FASTA format using
numeric sequence ids and returns the name of the temporary file. This is
only a convenience method.
In list context, returns the IdMapper object along with temporary filename.
    my $infile = $ali->temp_fasta( { degap => 1 } );
    my $output = `script.sh $infile`;
    ...
This method accepts the same optional argument hash as C<store_fasta>.
However, an additional option (C<id_prefix>) is available to control the way
abbreviated sequence ids are prefixed by the C<std_mapper> method (see
L<Bio::MUST::Core::Listable>).
    my $infile1 = $ali1->temp_fasta( { id_prefix => 'file1-' } );
    my $infile2 = $ali2->temp_fasta( { id_prefix => 'file2-' } );
Finally, it is possible to pass an existing L<Bio::MUST::Core::IdMapper> object
by providing the C<mapper> option.
=head2 load_phylip
Class method (constructor) returning a new Ali read from a file in the
PHYLIP format. Both sequential and interleaved formats are supported.
The only constraint is that sequence ids MUST NOT contain whitespace and be
followed by at least one whitespace character. This means that some old-school
PHYLIP files not following this convention will not be processed correctly.
When using the interleaved format, sequence ids may or may not be repeated in
each block.
    use aliased 'Bio::MUST::Core::Ali';
    my $ali = Ali->load_phylip('phylip.phy');
    say $ali->count_seqs;
    say $ali->width;
    # outputs:
    # 10
    # 709
This method requires one argument.
=head2 store_phylip
Writes the Ali to disk in the interleaved (or sequential) PHYLIP format.
To ensure maximal flexibility, this method fetches sequence ids using the
native C<full_id> method described in L<Bio::MUST::Core::SeqId>, but
truncates them to 10 characters, as expected by the original PHYLIP software
package. No other tinkering is carried out on the ids. Thus, if the ids
contain whitespace or are not unique in their 10 first characters, it is
advised to first map them using one of the constructors in
L<Bio::MUST::Core::IdMapper>.
    use aliased 'Bio::MUST::Core::Ali';
    use aliased 'Bio::MUST::Core::IdMapper';
    my $ali = Ali->load('input.ali');
    my $mapper = IdMapper->std_mapper($ali);
    $ali->shorten_ids($mapper);
    $ali->store_phylip( 'input.phy', { chunk => 50 } );
This method requires one argument and accepts a second optional argument
controlling the output format. It is a hash reference that may contain one
or more of the following keys:
    - short: truncate ids to 10 chars, as in original PHYLIP (defaut: yes)
    - clean: replace all ambiguous and missing states by 'X' (default: false)
    - chunk: line width (default: 60 chars; negative values means no wrap)
To store the Ali in PHYLIP sequential format, specify a negative chunk (-1).
=head2 load_stockholm
Class method (constructor) returning a new Ali read from a file in the
STOCKHOLM format. =GF comments are retained (see above) but not the other
comment classes (=GS, =GR and =GC).
    use aliased 'Bio::MUST::Core::Ali';
    my $ali = Ali->load('upsk.stockholm');
    say $ali->header;
    # outputs:
    # ID    UPSK
    # SE    Predicted; Infernal
    # SS    Published; PMID 9223489
    # RN    [1]
    # RM    9223489
    # RT    The role of the pseudoknot at the 3' end of turnip yellow mosaic
    # RT    virus RNA in minus-strand synthesis by the viral RNA-dependent RNA
    # RT    polymerase.
    # RA    Deiman BA, Kortlever RM, Pleij CW;
    # RL    J Virol 1997;71:5990-5996.
This method requires one argument.
=head2 load_tinyseq
Class method (constructor) returning a new Ali read from a file in NCBI
TinySeq XML format.
=head2 instant_store
Class method intended to transform a large sequence file read from disk
without loading it in memory. This method will transparently process plain
FASTA files in addition to the MUST pseudo-FASTA format (ALI files).
    my $chunk = 200;
    my $split = sub {
        my $seq = shift;
        my $base_id = ( split /\s+/xms, $seq->full_id )[0];
        my $max_pos = $seq->seq_len - $chunk;
        my $n = 0;
        my $out_str;
        for (my $pos = 0; $pos <= $max_pos; $pos += $chunk, $n++) {
            $out_str .= ">$base_id.$n\n" . $seq->edit_seq($pos,
                $pos + $chunk <= $max_pos ? $chunk : 2 * $chunk
            ) . "\n";
        }
        return $out_str;
    };
    use aliased 'Bio::MUST::Core::Ali';
    Ali->instant_store(
        'outfile.fasta', { infile => 'infile.fasta', coderef => $split }
    );
This method requires two arguments. The sercond is a hash reference that must
contain the following keys:
    - infile:  input sequence file
    - coderef: subroutine implementing the transforming logic
=head2 instant_count
Class method returning the number of seqs in any sequence file read from disk
without loading it in memory. This method will transparently process plain
FASTA files in addition to the MUST pseudo-FASTA format (ALI files).
    use aliased 'Bio::MUST::Core::Ali';
    my $seq_n = Ali->instant_count('input.ali');
    say $seq_n;
=head1 ALIASES
=head2 height
Alias for C<count_seqs> method. For API consistency.
=head1 AUTHOR
Denis BAURAIN <denis.baurain@uliege.be>
=head1 CONTRIBUTORS
=for stopwords Catherine COLSON Arnaud DI FRANCO
=over 4
=item *
Catherine COLSON <ccolson@doct.uliege.be>
=item *
Arnaud DI FRANCO <arnaud.difranco@gmail.com>
=back
=head1 COPYRIGHT AND LICENSE
This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.
=cut
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)