scripts/cutadapt_polyA_algo_timing.py

#!/usr/bin/env python3
import time
import random
import statistics
from typing import List

# Seed the module-level RNG with a fixed value so that the random strings
# produced by generate_string() are the same on every run of this script.
random.seed(1234)
def generate_string(length: int) -> str:
    """Return a random test string of ``length`` characters ending in a run of 'A'.

    The first ``int(0.8 * length)`` characters are drawn one at a time from
    'ACTG' with ``random.choice`` (so the result depends on the current state
    of the module-level RNG); the remaining characters are all 'A'.
    """
    prefix_len = int(0.8 * length)
    # One random.choice call per base keeps the RNG consumption predictable.
    random_bases = [random.choice('ACTG') for _ in range(prefix_len)]
    prefix = ''.join(random_bases)
    tail = 'A' * (length - len(prefix))
    return prefix + tail

def benchmark(s: str) -> float:
    """Time one run of the error-checked trailing-'A' search over ``s``.

    The timed algorithm walks ``s`` from its last character to its first,
    adding 1 to a running score for every 'A' and subtracting 2 (and counting
    an error) for anything else.  A position becomes the new best cut point
    when the score exceeds the best score so far and the error count is at
    most 20% of the number of characters scanned.  The string is then sliced
    at the best position; the slice is discarded, it is only there so that
    its cost is included in the measurement.

    Args:
        s: The string to scan.

    Returns:
        Elapsed time in seconds for the scan plus the final slice.
    """
    # perf_counter() is monotonic and uses the highest-resolution clock
    # available, unlike time.time(), which can be coarse or jump when the
    # system clock is adjusted.
    start_time = time.perf_counter()
    n = len(s)
    best_index = n
    best_score = score = errors = 0
    # The list materialisation is deliberately kept: it is part of the code
    # being measured.
    for i, nuc in reversed(list(enumerate(s))):
        if nuc == "A":
            score += 1
        else:
            score -= 2
            errors += 1
        # (n - i) is the number of characters scanned so far, including s[i].
        if score > best_score and errors <= 0.2 * (n - i):
            best_index = i
            best_score = score
    s = s[:best_index]
    end_time = time.perf_counter()
    return end_time - start_time

def benchmark_nonerror(s: str) -> float:
    """Time one run of the trailing-'A' search that does not count errors.

    The timed algorithm walks ``s`` from its last character to its first,
    adding 1 to a running score for every 'A' and subtracting 2 for anything
    else, and remembers the position with the highest score.  After the scan
    a single threshold check may reset the cut point to the end of the
    string, and the string is sliced there; the slice is discarded, it is
    only there so that its cost is included in the measurement.

    Args:
        s: The string to scan.

    Returns:
        Elapsed time in seconds for the scan, the check and the final slice.
    """
    # perf_counter() is monotonic and uses the highest-resolution clock
    # available, unlike time.time(), which can be coarse or jump when the
    # system clock is adjusted.
    start_time = time.perf_counter()
    n = len(s)
    best_index = n
    best_score = score = 0
    # The list materialisation is deliberately kept: it is part of the code
    # being measured.
    for i, nuc in reversed(list(enumerate(s))):
        if nuc == "A":
            score += 1
        else:
            score -= 2
        if score > best_score:
            best_index = i
            best_score = score
    # NOTE(review): best_score never drops below 0 and the right-hand side is
    # always <= -0.4, so this branch cannot be taken as written.  It is kept
    # unchanged because it is part of the measured code; confirm the intended
    # threshold with the author.
    if best_score < - 0.4 * ( best_index + 1 ) :
        best_index = n
    s = s[:best_index]
    end_time = time.perf_counter()
    return end_time - start_time



# String lengths to benchmark and the number of timed runs per length.
lengths = [100, 1000, 2000, 10000]
repetitions = 2000


def _run_benchmark(title, bench_func):
    """Print a banner and per-length timing statistics for one benchmark.

    For every entry in ``lengths`` a fresh string is generated and timed
    ``repetitions`` times; mean, standard deviation, median, minimum and
    maximum are printed in microseconds.

    Args:
        title: Text printed between the two separator lines.
        bench_func: Callable taking a string and returning elapsed seconds.
    """
    print("-" * 80)
    print(title)
    print("-" * 80)

    for length in lengths:
        # A new string is generated for every repetition, and generation
        # happens outside the timed function.
        times = [bench_func(generate_string(length)) for _ in range(repetitions)]
        min_time = min(times)
        max_time = max(times)
        # Timings are in seconds; scale by 1e6 to report microseconds.
        print(f"\nStatistics for string of length {length}:")
        print(f"Mean time: {1000000*statistics.mean(times):.2e} microseconds")
        print(f"Standard deviation: {1000000*statistics.stdev(times):.2e} microseconds")
        print(f"Median time: {1000000*statistics.median(times):.2e} microseconds")
        print(f"Min time: {1000000*min_time:.2e} microseconds")
        print(f"Max time: {1000000*max_time:.2e} microseconds")


_run_benchmark("Benchmarking with original cutadapt algorithm", benchmark)
_run_benchmark("Benchmarking with modified cutadapt algorithm", benchmark_nonerror)

	Global
`s`	Focus search bar
`?`	Bring up this help dialog

	GitHub
`g` `p`	Go to pull requests
`g` `i`	Go to GitHub issues (only if GitHub is the preferred repository)

	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse

	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)