Fast reference for common phylogenetic analysis tasks.
pip install phykit dendropy biopython pandas numpy scipyfrom scripts.tree_statistics import phykit_treeness, phykit_tree_length
# Treeness
treeness = phykit_treeness("tree.nwk")
print(f"Treeness: {treeness:.4f}")
# Tree length
length = phykit_tree_length("tree.nwk")
print(f"Tree length: {length:.4f}")from scripts.tree_statistics import phykit_parsimony_informative, phykit_rcv
# Parsimony informative sites
pi_count, aln_len, pi_pct = phykit_parsimony_informative("alignment.fa")
print(f"PI sites: {pi_count} / {aln_len} ({pi_pct:.2f}%)")
# RCV
rcv = phykit_rcv("alignment.fa")
print(f"RCV: {rcv:.4f}")from scripts.format_alignment import discover_gene_files
from scripts.tree_statistics import batch_treeness, batch_dvmc
import numpy as np
# Discover files
gene_files = discover_gene_files("data/")
print(f"Found {len(gene_files)} genes")
# Compute metric for all genes
treeness_results = batch_treeness(gene_files)
dvmc_results = batch_dvmc(gene_files)
# Get median
print(f"Median treeness: {np.median(list(treeness_results.values())):.4f}")
print(f"Median DVMC: {np.median(list(dvmc_results.values())):.4f}")from scipy import stats
# Get metrics for both groups
fungi_genes = discover_gene_files("data/", group_name="fungi")
animal_genes = discover_gene_files("data/", group_name="animals")
fungi_treeness = batch_treeness(fungi_genes)
animal_treeness = batch_treeness(animal_genes)
# Mann-Whitney U test
u_stat, p_value = stats.mannwhitneyu(
list(fungi_treeness.values()),
list(animal_treeness.values())
)
print(f"U statistic: {u_stat:.0f}")
print(f"P-value: {p_value:.4e}")from scripts.format_alignment import filter_by_gap_threshold
from scripts.tree_statistics import batch_treeness_over_rcv
# Filter by gap percentage
valid_genes = filter_by_gap_threshold(gene_files, max_gap_pct=5.0)
print(f"Genes with <5% gaps: {len(valid_genes)}")
# Compute treeness/RCV for valid genes
results = batch_treeness_over_rcv(valid_genes)
# Extract ratios
ratios = [r[0] for r in results.values()]
print(f"Median treeness/RCV: {np.median(ratios):.4f}")# Single file
python scripts/format_alignment.py convert input.phy output.fa --format fasta
# Batch convert
python scripts/format_alignment.py batch-convert input_dir/ output_dir/ --format phylip-relaxed| Function | Input | Output |
|---|---|---|
phykit_treeness() |
tree.nwk | float (0-1) |
phykit_tree_length() |
tree.nwk | float |
phykit_evolutionary_rate() |
tree.nwk | float |
phykit_dvmc() |
tree.nwk | float |
phykit_rcv() |
alignment.fa | float |
phykit_parsimony_informative() |
alignment.fa | (count, length, pct) |
phykit_treeness_over_rcv() |
tree.nwk, alignment.fa | (ratio, treeness, rcv) |
alignment_gap_percentage() |
alignment.fa | float (0-100) |
data/
├── fungi/
│ ├── gene1.fa
│ ├── gene1.nwk
│ ├── gene2.fa
│ └── gene2.nwk
└── animals/
├── gene1.fa
├── gene1.nwk
├── gene2.fa
└── gene2.nwk
# Question: "What is the median DVMC for fungi?"
fungi_genes = discover_gene_files("data/", group_name="fungi")
fungi_dvmc = batch_dvmc(fungi_genes)
median_dvmc = np.median(list(fungi_dvmc.values()))
print(f"Median DVMC: {median_dvmc:.4f}")# Question: "What is the ratio of minimum PI sites (fungi / animals)?"
fungi_pi = batch_parsimony_informative(fungi_genes)
animal_pi = batch_parsimony_informative(animal_genes)
fungi_counts = [r[0] for r in fungi_pi.values()]
animal_counts = [r[0] for r in animal_pi.values()]
ratio = min(fungi_counts) / min(animal_counts)
print(f"Ratio: {ratio:.4f}")# Question: "What percentage of trees have treeness > 0.45?"
treeness_results = batch_treeness(gene_files)
values = list(treeness_results.values())
above = sum(1 for v in values if v > 0.45)
percentage = (above / len(values)) * 100
print(f"Percentage: {percentage:.2f}%")python scripts/format_alignment.py discover data/ --group fungipython scripts/format_alignment.py filter data/ --max-gap 5.0 --min-seqs 4python scripts/tree_statistics.py tree.nwk alignment.fapython scripts/format_alignment.py remove-gappy-seqs input.fa output.fa --max-gap 50.0python scripts/format_alignment.py remove-gappy-cols input.fa output.fa --max-gap 50.0- Check file format (FASTA must be
>name\nseq, Newick must end with;) - See
references/troubleshooting.md
- Check directory structure
- Run:
python scripts/format_alignment.py discover data/
- Check rounding (PhyKIT default: 4 decimals)
- Check PhyKIT version:
pip show phykit
- Detailed workflows: See
SKILL.md - Alignment analysis: See
references/sequence_alignment.md - Tree construction: See
references/tree_building.md - Statistical comparisons: See
references/parsimony_analysis.md - Troubleshooting: See
references/troubleshooting.md
# Core imports for most tasks
import numpy as np
import pandas as pd
from scipy import stats
# Load functions from scripts
from scripts.format_alignment import discover_gene_files
from scripts.tree_statistics import (
phykit_treeness,
phykit_tree_length,
phykit_evolutionary_rate,
phykit_dvmc,
phykit_rcv,
phykit_parsimony_informative,
phykit_treeness_over_rcv,
batch_treeness,
batch_dvmc,
batch_rcv,
summary_stats,
compare_groups
)