Monday, November 17, 2025

Convert FASTA and PHYLIP Alignment Files Automatically Using Python

Working with evolutionary biology software like HyPhy and PAML often requires converting sequence alignments between different formats. Two of the most common alignment file formats are FASTA and PHYLIP. Unfortunately, switching between them manually can be tedious, especially when dealing with large datasets or multiple files.

To simplify this process, here's a Python script that can automatically detect the input format (FASTA or PHYLIP) and convert it in either direction — FASTA → PHYLIP or PHYLIP → FASTA — in a single step.


✨ Features

  • Automatic format detection — no need to specify the input type.
  • Supports both strict (PAML) and relaxed (HyPhy) PHYLIP formats.
  • Handles interleaved and sequential PHYLIP layouts.
  • Preserves sequence order and names.
  • Pure Python — no external dependencies required.

📜 The Python Script

Copy and save the following code as convert_alignment.py. It will work with Python 3.

#!/usr/bin/env python3
"""
Convert between FASTA and PHYLIP alignment formats automatically.

Usage:
    python convert_alignment.py input_file output_file [--relaxed]

Description:
    Automatically detects the input format:
      - FASTA (.fasta, .fa, .aln, .afa) → PHYLIP (.phy)
      - PHYLIP (.phy, .phylip) → FASTA (.fasta)

Options:
    --relaxed   Use relaxed PHYLIP format (long names, for HyPhy)
"""

import sys
import os
import re

# ------------------------
# FASTA → PHYLIP conversion
# ------------------------

def read_fasta(filename):
    """Read a FASTA file and return {name: sequence}."""
    sequences = {}
    with open(filename) as f:
        name, seq = None, []
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line.startswith(">"):
                if name:
                    sequences[name] = "".join(seq)
                name = line[1:].split()[0]
                seq = []
            else:
                seq.append(line.replace(" ", ""))
        if name:
            sequences[name] = "".join(seq)
    return sequences


def write_phylip(sequences, outfile, relaxed=False):
    """Write sequences to PHYLIP format (strict or relaxed)."""
    names = list(sequences.keys())
    seqs = list(sequences.values())
    nseq, length = len(seqs), len(seqs[0])

    for name, seq in sequences.items():
        if len(seq) != length:
            raise ValueError(f"Error: sequence '{name}' length {len(seq)} != expected {length}")

    with open(outfile, "w") as out:
        out.write(f"{nseq} {length}\n")
        for name, seq in sequences.items():
            if relaxed:
                out.write(f"{name.ljust(15)} {seq}\n")
            else:
                out.write(f"{name[:10].ljust(10)} {seq}\n")


# ------------------------
# PHYLIP → FASTA conversion
# ------------------------

def read_phylip(filename):
    """Read PHYLIP (strict or relaxed, sequential or interleaved)."""
    with open(filename) as f:
        lines = [line.rstrip() for line in f if line.strip()]

    header = lines[0]
    parts = header.split()
    if len(parts) < 2:
        raise ValueError("Invalid PHYLIP header line")
    nseq, nsites = map(int, parts[:2])
    lines = lines[1:]

    seq_dict, seq_order = {}, []
    name_pattern = re.compile(r"^(\S+)\s+([A-Za-z\-?]+)$")

    for line in lines:
        match = name_pattern.match(line)
        if match:
            name, seq = match.groups()
            seq_dict[name] = seq
            seq_order.append(name)
        else:
            break

    if len(seq_dict) < nseq:
        seq_dict = {name: "" for name in seq_order}
        block_lines = lines.copy()
        while block_lines:
            for name in seq_order:
                if not block_lines:
                    break
                line = block_lines.pop(0)
                if not line.strip():
                    continue
                parts = line.split()
                if len(parts) == 1:
                    seq = parts[0]
                else:
                    seq = parts[1]
                seq_dict[name] += seq

    for name, seq in seq_dict.items():
        if len(seq) != nsites:
            print(f"Warning: sequence '{name}' length {len(seq)} != expected {nsites}")

    return seq_dict


def write_fasta(seq_dict, outfile):
    """Write sequences in FASTA format."""
    with open(outfile, "w") as out:
        for name, seq in seq_dict.items():
            out.write(f">{name}\n")
            for i in range(0, len(seq), 60):
                out.write(seq[i:i+60] + "\n")


# ------------------------
# Auto-detection and conversion
# ------------------------

def detect_format(infile):
    """Guess file format based on extension or content."""
    ext = os.path.splitext(infile)[1].lower()
    if ext in [".fasta", ".fa", ".aln", ".afa"]:
        return "fasta"
    elif ext in [".phy", ".phylip"]:
        return "phylip"

    with open(infile) as f:
        first = f.readline()
        if first.startswith(">"):
            return "fasta"
        elif re.match(r"^\s*\d+\s+\d+", first):
            return "phylip"
    raise ValueError("Could not detect input format (expected FASTA or PHYLIP).")


def main():
    if len(sys.argv) < 3:
        print(__doc__)
        sys.exit(1)

    infile = sys.argv[1]
    outfile = sys.argv[2]
    relaxed = "--relaxed" in sys.argv

    fmt = detect_format(infile)

    if fmt == "fasta":
        seqs = read_fasta(infile)
        write_phylip(seqs, outfile, relaxed=relaxed)
        print(f"Converted FASTA → PHYLIP ({'relaxed' if relaxed else 'strict'}) → {outfile}")
    elif fmt == "phylip":
        seqs = read_phylip(infile)
        write_fasta(seqs, outfile)
        print(f"Converted PHYLIP → FASTA → {outfile}")
    else:
        raise ValueError("Unrecognized format.")


if __name__ == "__main__":
    main()

🧪 How to Use

Open your terminal or command prompt in the directory where you saved the script and run:

python convert_alignment.py input_alignment.fasta output_alignment.phy

This will automatically detect that the input is in FASTA format and convert it to PHYLIP format suitable for PAML.

To generate a HyPhy-compatible relaxed PHYLIP file (with longer sequence names):

python convert_alignment.py input.aln output.phy --relaxed

And to convert back from PHYLIP to FASTA:

python convert_alignment.py input.phy output.fasta

💡 Behind the Scenes

  • Automatic detection looks at the file extension and the first line of the file.
  • FASTA files are recognized by the > symbol at the start of each sequence.
  • PHYLIP files are recognized by the numeric header line (e.g. 5 1200 for 5 sequences, 1200 sites).
  • The script supports both strict (10-character names) for PAML and relaxed (long names) for HyPhy.

🧭 Why This Matters

Researchers in molecular evolution, phylogenetics, and comparative genomics often need to move seamlessly between tools like HyPhy, PAML, PhyML, and MEGA. Each expects different input file conventions — this script saves time and avoids errors in manual reformatting.


🚀 Summary

  • Fully automated alignment format converter.
  • Compatible with all major phylogenetic tools.
  • Ideal for bioinformatics pipelines or teaching labs.

Whether you're preparing alignments for selection analysis in PAML or rate variation modeling in HyPhy, this script keeps your workflow smooth and reproducible.



No comments: