#!c:\Perl\bin\perl.exe -w
use
Carp
qw (cluck
croak carp) ;
my
$ProgramName
= basename($0) ;
my
$OptionHelp
=
undef
;
my
$VerboseLevel
=
undef
;
my
$Database
=
'MTHFRAG'
;
my
$Input
=
'../MS_fragments-adducts-isotopes__V1.1.txt'
;
my
$databaseFile
=
'../Knapsack__V1_1.csv'
;
my
$databaseNewFile
=
'./tmp/Knapsack.new'
;
&GetOptions
(
"h|help"
=> \
$OptionHelp
,
"v|verbose=i"
=> \
$VerboseLevel
,
"database=s"
=> \
$Database
,
"input|i=s"
=> \
$Input
,
) ;
if
(
defined
(
$OptionHelp
) ){
&help
; }
if
( (
defined
$Database
) and (
$Database
eq
'KNAPSACK'
) ) {
&updateKnapsack
(
$databaseFile
) ;
}
elsif
( (
defined
$Database
) and (
$Database
eq
'MTHFRAG'
) ) {
&validateMTHFragmentList
(
$Input
) ;
}
sub
validateMTHFragmentList {
my
(
$source
) =
@_
;
my
%fragment
= (
_TYPE_
=>
'type'
,
_DELTA_MASS_
=>
'delta_mass'
,
_LOSSES_OR_GAINS_
=>
'losses_or_gains'
,
_ANNOTATION_IN_POS_MODE_
=>
'annotation_in_pos_mode'
,
_ANNOTATION_IN_NEG_MODE_
=>
'annotation_in_neg_mode'
,
) ;
my
$entriesNb
= 0 ;
my
$checker
= 0 ;
my
@fragments
= () ;
print
"##\n## * * * VALIDATION OF $source * * * ##\n##\n"
;
my
$csv
= Text::CSV->new ( {
'sep_char'
=>
"\t"
,
binary
=> 1,
auto_diag
=> 1,
eol
=>
"\n"
} )
or
die
"Cannot use CSV: "
.Text::CSV->error_diag ();
open
my
$fh
,
"<"
,
$source
or
die
"$source: $!"
;
$csv
->header (
$fh
, {
munge_column_names
=>
sub
{
s/\s+$//;
s/^\s+//;
my
$uc_col
=
'_'
.
uc
$_
.
'_'
;
if
(
$_
ne
'example_valine'
) {
$fragment
{
$uc_col
} or
die
"Unknown column '$_' in $source"
;
}
}});
while
(
my
$row
=
$csv
->getline_hr (
$fh
)) {
my
%currentFrag
=
%fragment
;
print
"[INFO] START checking line $entriesNb...\t\t"
;
if
( (
$row
->{
'type'
} ) and ( (
$row
->{
'type'
} eq
'adduct'
) or (
$row
->{
'type'
} eq
'isotope'
) or (
$row
->{
'type'
} eq
'fragment'
) or (
$row
->{
'type'
} eq
'pseudomolecular ion'
) ) ) {
$currentFrag
{_TYPE_} =
$row
->{
'type'
} ;
$checker
++ ;
}
else
{
warn
"\t[WARN] The type for line $entriesNb is undef or unknown ($row->{'type'}) \n"
;
$currentFrag
{_TYPE_} =
undef
;
}
if
( (
$row
->{
'delta_mass'
} ) and (
$row
->{
'delta_mass'
} > 0 or
$row
->{
'delta_mass'
} < 0) ) {
$currentFrag
{_DELTA_MASS_} =
$row
->{
'delta_mass'
} ;
$checker
++ ;
}
else
{
warn
"\t[WARN] The delta_mass for line $entriesNb is undef or equal to 0\n"
;
$currentFrag
{_DELTA_MASS_} =
undef
;
}
if
( (
$row
->{
'losses_or_gains'
} ) and (
$row
->{
'losses_or_gains'
} ne
''
) ) {
$currentFrag
{_LOSSES_OR_GAINS_} =
$row
->{
'losses_or_gains'
} ;
$checker
++ ;
}
else
{
warn
"\t[WARN] The losses or gains for line $entriesNb is undef or void\n"
;
$currentFrag
{_LOSSES_OR_GAINS_} =
undef
;
}
$currentFrag
{_ANNOTATION_IN_POS_MODE_} =
$row
->{
'annotation_in_pos_mode'
} ;
$currentFrag
{_ANNOTATION_IN_NEG_MODE_} =
$row
->{
'annotation_in_neg_mode'
} ;
my
%tmp
=
%currentFrag
;
push
(
@fragments
, \
%tmp
) ;
if
(
$checker
== 3 ) {
print
" line is OK\n"
; }
if
(
$checker
< 3 ) {
print
"\n"
; }
$checker
= 0 ;
$entriesNb
++ ;
}
print
Dumper
@fragments
;
return
(0) ;
}
sub
updateKnapsack {
my
(
$dbFile
, ) =
@_
;
my
(
$lastID
,
$nbEntries
) = ( 0, 0 ) ;
print
"[INFO] Parsing Knapsack current version...\n"
;
my
$csv
= Text::CSV->new ( {
'sep_char'
=>
","
,
binary
=> 1 } )
or
die
"Cannot use CSV: "
.Text::CSV->error_diag ();
open
my
$fh
,
"<:encoding(utf8)"
,
$dbFile
or
die
"Can't open csv file $dbFile: $!"
;
while
(
my
$row
=
$csv
->getline(
$fh
) ) {
if
(
$row
->[0] eq
'knapsackid'
) {
next
;
}
if
(
$row
->[0] =~ /^C([0-9]*)/ ) {
$nbEntries
++ ;
my
$id
= $1 ;
if
( (
defined
$id
) and (
$id
>
$lastID
) ) {
$lastID
=
$id
;
}
else
{
next
;
}
}
}
$csv
->
eof
or
$csv
->error_diag();
close
$fh
;
print
"[INFO] Knapsack db parsed with: $nbEntries entries\n"
;
print
"[INFO] Last Knapsack id in exported db is: $lastID\n"
;
my
$tryAgain
=
'TRUE'
;
my
(
$runNbTrue
,
$runNbFalse
) = (0, 0) ;
my
$thresholdFalse
= 10 ;
my
$thresholdTrue
= 1000 ;
my
%NewKnapSackDump
= () ;
my
$newId
=
$lastID
;
while
(
$tryAgain
eq
'TRUE'
) {
$newId
=
$newId
+1 ;
my
$newFormattedId
=
'C'
.
sprintf
'%08s'
,
$newId
;
print
"[INFO] Trying to find new data with Knapsack id: $newFormattedId\n"
;
if
(
defined
$newFormattedId
) {
my
$ua
= LWP::UserAgent->new;
my
$results
=
$ua
->get(
"$url"
);
if
(
$results
->content =~/<font class=er>Input key word error!! <br>/) {
print
"[WARN] This ID ($newFormattedId) doesn't exist in knapsack db today\n"
;
$runNbFalse
++ ;
if
(
$runNbFalse
>=
$thresholdFalse
) {
$tryAgain
=
'FALSE'
;
}
}
else
{
$runNbTrue
++ ;
$runNbFalse
= 0 ;
my
$id
=
undef
;
if
(
$results
->content =~/<title>KNApSAcK Metabolite Information - (.*)<\/title>/) {
$NewKnapSackDump
{$1}{
'knapsackid'
} = $1 ;
$id
= $1 ;
}
if
(
$results
->content =~/<th class=
"inf"
>Name<\/th>\n\s+<td colspan=
"4"
class=
"inf"
>(.*)<\/td>/) {
my
$tempname
= $1 ;
if
(
$tempname
=~/<br>/) {
my
@names
=
split
(/<br>/,
$tempname
) ;
$NewKnapSackDump
{
$id
}{
'name'
} =
$names
[0] ;
}
else
{
$NewKnapSackDump
{
$id
}{
'name'
} =
$tempname
;
}
}
if
(
$results
->content =~/<th class=
"inf"
>Formula<\/th>\n\s+<td colspan=
"4"
>(.*)<\/td>/) {
$NewKnapSackDump
{
$id
}{
'formula'
} = $1 ;
}
if
(
$results
->content =~/<th class=
"inf"
>Mw<\/th>\n\s+<td colspan=
"4"
>(.*)<\/td>/) {
$NewKnapSackDump
{
$id
}{
'mw'
} = $1 ;
}
if
(
$results
->content =~/<th class=
"inf"
>CAS RN<\/th>\n\s+<td colspan=
"4"
>(.*)<\/td>/) {
$NewKnapSackDump
{
$id
}{
'cas'
} = $1 ;
}
if
(
$results
->content =~/<th class=
"inf"
>InChIKey<\/th>\n\s+<td colspan=
"4"
>(.*)<\/td>/) {
$NewKnapSackDump
{
$id
}{
'inchikey'
} = $1 ;
}
if
(
$results
->content =~/<th class=
"inf"
>InChICode<\/th>\n\s+<td colspan=
"4"
>(.*)<\/td>/) {
$NewKnapSackDump
{
$id
}{
'inchi'
} = $1 ;
}
if
(
$runNbTrue
>=
$thresholdTrue
) {
$tryAgain
=
'FALSE'
;
}
}
}
}
open
(CSV,
'>:utf8'
,
$databaseNewFile
) or
die
"Cant' create the file $databaseNewFile\n"
;
foreach
my
$id
(
sort
keys
%NewKnapSackDump
) {
if
( (
$NewKnapSackDump
{
$id
}{
'name'
} ) and (
$NewKnapSackDump
{
$id
}{
'name'
} ne
''
) and (
$NewKnapSackDump
{
$id
}{
'mw'
} ) and (
$NewKnapSackDump
{
$id
}{
'mw'
} > 0 ) ) {
print
"$id,\"$NewKnapSackDump{$id}{'name'}\",$NewKnapSackDump{$id}{'formula'},$NewKnapSackDump{$id}{'mw'},$NewKnapSackDump{$id}{'cas'},$NewKnapSackDump{$id}{'inchikey'}\n"
;
print
CSV
"$id,\"$NewKnapSackDump{$id}{'name'}\",$NewKnapSackDump{$id}{'formula'},$NewKnapSackDump{$id}{'mw'},$NewKnapSackDump{$id}{'cas'},$NewKnapSackDump{$id}{'inchikey'}\n"
;
}
}
close
(CSV) ;
return
() ;
}
sub
help {
print
STDERR
<<EOF ;
### $ProgramName ###
#
# AUTHOR: Franck Giacomoni
# VERSION: 1.0
# CREATED: 2020/04/24
# LAST MODIF:
# PURPOSE:
# USAGE: $ProgramName or $ProgramName -o options
EOF
exit
(1) ;
}