修改.pdb文件中的值并保存为新文件

1 投票
2 回答
85 浏览
提问于 2025-04-14 16:51

为了完成一个作业,我需要用Python读取一个小的.pdb文件,修改一些数值,然后保存为一个新的.pdb文件。
我的原始文件是这样的:

ATOM     19  HD2 TYR     1      26.910  61.717  39.871  1.00  0.00           H  
ATOM     20  C   TYR     1      29.796  62.354  41.246  1.00  0.00           C  
ATOM     23  H   SER     2      30.611  61.950  39.410  1.00  0.00           H  
ATOM     24  CA  SER     2      30.082  64.035  39.354  1.00  0.00           C 
END 

我尝试过用Pandas,但没有成功,因为我无法以想要的格式保存文件,而且还会多保存一个索引,我不想要这个(我用的是.to_csv('newfile.pdb'))。
我找到一个叫BioPython的模块,这是我用它的尝试:

from Bio.PDB import PDBParser, PDBIO

def translate_atoms(structure, resname, translation_vector):
    for model in structure:
        for chain in model:
            for residue in chain:
                if residue.resname == resname:
                    for atom in residue:
                        atom.coord += translation_vector

pdb_file = "pdb1.pdb"
# Read the PDB file
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)

# Translation vector for x direction (0.55 nm, so 5.5Å)
translation_vector = [5.5, 0, 0]

# Translate all atoms of SER residue in x direction
translate_atoms(structure, "SER", translation_vector)

# Write the modified structure to a new PDB file
output_pdb = "modified_pdb1.pdb"
pdb_io = PDBIO()
pdb_io.set_structure(structure)
pdb_io.save(output_pdb)

这个方法能让我修改数值,但在保存时会多加一行不需要的内容,像这样:

ATOM     19  HD2 TYR     1      26.910  61.717  39.871  1.00  0.00           H  
ATOM     20  C   TYR     1      29.796  62.354  41.246  1.00  0.00           C  
ATOM     23  H   SER     2      36.111  61.950  39.410  1.00  0.00           H  
ATOM     24  CA  SER     2      35.582  64.035  39.354  1.00  0.00           C  
TER      33      SER     2
END

我该怎么做才能保存时不加那最后一行呢?

谢谢你的帮助!

2 个回答

1

好的,这是我根据你的输入做的尝试。


from Bio.PDB import PDBParser, PDBIO

import Bio

print('Biopython version : ', Bio.__version__)


def translate_atoms(structure, resname, translation_vector):
    for model in structure:
        for chain in model:
            for residue in chain:
                if residue.resname == resname:
                    for atom in residue:
                        atom.coord += translation_vector

pdb_file = "input.pdb"
# Read the PDB file
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)

# Translation vector for x direction (0.55 nm, so 5.5Å)
translation_vector = [5.5, 0, 0]

# Translate all atoms of SER residue in x direction
translate_atoms(structure, "SER", translation_vector)

# Write the modified structure to a new PDB file
output_pdb = "modified_pdb1.pdb"
pdb_io = PDBIO()
pdb_io.set_structure(structure)
pdb_io.save(output_pdb , write_end = False , preserve_atom_numbering = True)

这样会得到:

ATOM     19  HD2 TYR     1      26.910  61.717  39.871  1.00  0.00           H  
ATOM     20  C   TYR     1      29.796  62.354  41.246  1.00  0.00           C  
ATOM     23  H   SER     2      36.111  61.950  39.410  1.00  0.00           H  
ATOM     24  CA  SER     2      35.582  64.035  39.354  1.00  0.00           C  
TER      25      SER     2                                                       

这是因为在 pdb_io.save(output_pdb , write_end = False , preserve_atom_numbering = True) 中使用了 write_end = False

可以查看文档 类 Bio.PDB.PDBIO.PDBIO(use_model_flag=0)

更长的代码:


from Bio.PDB import PDBParser, PDBIO, Select

import Bio

from Bio.PDB.PDBExceptions import PDBIOException

print('Biopython version : ', Bio.__version__)

class PDBIO(PDBIO):
    
    def save(self, file, select = Select(), write_end=True, preserve_atom_numbering=False):
        """Save structure to a file.

        :param file: output file
        :type file: string or filehandle

        :param select: selects which entities will be written.
        :type select: object

        Typically select is a subclass of L{Select}, it should
        have the following methods:

         - accept_model(model)
         - accept_chain(chain)
         - accept_residue(residue)
         - accept_atom(atom)

        These methods should return 1 if the entity is to be
        written out, 0 otherwise.

        Typically select is a subclass of L{Select}.
        """
        if isinstance(file, str):
            fhandle = open(file, "w")
        else:
            # filehandle, I hope :-)
            fhandle = file

        get_atom_line = self._get_atom_line

        # multiple models?
        if len(self.structure) > 1 or self.use_model_flag:
            model_flag = 1
        else:
            model_flag = 0

        for model in self.structure.get_list():
            if not select.accept_model(model):
                continue
            # necessary for ENDMDL
            # do not write ENDMDL if no residues were written
            # for this model
            model_residues_written = 0
            if not preserve_atom_numbering:
                atom_number = 1
            if model_flag:
                fhandle.write(f"MODEL      {model.serial_num}\n")

            for chain in model.get_list():
                if not select.accept_chain(chain):
                    continue
                chain_id = chain.id
                if len(chain_id) > 1:
                    e = f"Chain id ('{chain_id}') exceeds PDB format limit."
                    raise PDBIOException(e)

                # necessary for TER
                # do not write TER if no residues were written
                # for this chain
                chain_residues_written = 0

                for residue in chain.get_unpacked_list():
                    if not select.accept_residue(residue):
                        continue
                    hetfield, resseq, icode = residue.id
                    resname = residue.resname
                    segid = residue.segid
                    resid = residue.id[1]
                    if resid > 9999:
                        e = f"Residue number ('{resid}') exceeds PDB format limit."
                        raise PDBIOException(e)

                    for atom in residue.get_unpacked_list():
                        if not select.accept_atom(atom):
                            continue
                        chain_residues_written = 1
                        model_residues_written = 1
                        if preserve_atom_numbering:
                            atom_number = atom.serial_number

                        try:
                            s = get_atom_line(
                                atom,
                                hetfield,
                                segid,
                                atom_number,
                                resname,
                                resseq,
                                icode,
                                chain_id,
                            )
                        except Exception as err:
                            # catch and re-raise with more information
                            raise PDBIOException(
                                f"Error when writing atom {atom.full_id}"
                            ) from err
                        else:
                            fhandle.write(s)
                            # inconsequential if preserve_atom_numbering is True
                            atom_number += 1

                # if chain_residues_written:
                #     fhandle.write(
                #         _TER_FORMAT_STRING
                #         % (atom_number, resname, chain_id, resseq, icode)
                #     )

            if model_flag and model_residues_written:
                fhandle.write("ENDMDL\n")
        if write_end:
            fhandle.write("END   \n")

        if isinstance(file, str):
            fhandle.close()


def translate_atoms(structure, resname, translation_vector):
    for model in structure:
        for chain in model:
            for residue in chain:
                if residue.resname == resname:
                    for atom in residue:
                        atom.coord += translation_vector

pdb_file = "input.pdb"
# Read the PDB file
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)

# Translation vector for x direction (0.55 nm, so 5.5Å)
translation_vector = [5.5, 0, 0]

# Translate all atoms of SER residue in x direction
translate_atoms(structure, "SER", translation_vector)

# Write the modified structure to a new PDB file
output_pdb = "modified_pdb1.pdb"
pdb_io = PDBIO()
pdb_io.set_structure(structure)
pdb_io.save(output_pdb , write_end = False ,, preserve_atom_numbering = True)

这样会得到:

ATOM     19  HD2 TYR     1      26.910  61.717  39.871  1.00  0.00           H  
ATOM     20  C   TYR     1      29.796  62.354  41.246  1.00  0.00           C  
ATOM     23  H   SER     2      36.111  61.950  39.410  1.00  0.00           H  
ATOM     24  CA  SER     2      35.582  64.035  39.354  1.00  0.00           C  

因为我对 PDBIO 进行了修改,具体是 类 PDBIO(StructureIO) 中删除了:

# if chain_residues_written:
#     fhandle.write(
#         _TER_FORMAT_STRING
#         % (atom_number, resname, chain_id, resseq, icode)
#     )

不过需要导入: from Bio.PDB.PDBExceptions import PDBIOException

附带说明:

你的原始代码是错误的,缺少了: preserve_atom_numbering = True

pdb_io.save(output_pdb) 中,至少在最后一个稳定版本的 Biopython 中是这样。

0

我新增了一个叫做 save_without_ter_end 的函数,用来保存修改后的结构,但不包含 TER 和 END 这两行。这个 save_without_ter_end 函数会手动构建 PDB 文件的每一行,确保只写入 ATOM 记录,并在最后加上 END 行。

我去掉了 PDBIO 的使用,因为它不能满足我想要的效果,无法排除 TER 和 END 行。

from Bio.PDB import PDBParser, PDBIO

def translate_atoms(structure, resname, translation_vector):
    for model in structure:
        for chain in model:
            for residue in chain:
                if residue.resname == resname:
                    for atom in residue:
                        atom.coord += translation_vector

def save_without_ter_end(structure, filename):
    with open(filename, 'w') as pdb_file:
        for model in structure:
            for chain in model:
                for residue in chain:
                    for atom in residue:
                        atom_line = f"ATOM  {atom.serial_number:<5d} {atom.get_name():>4s} {residue.resname:>3s} {chain.id}{residue.id[1]:>4d}    {atom.coord[0]:8.3f}{atom.coord[1]:8.3f}{atom.coord[2]:8.3f}{atom.occupancy:6.2f}{atom.bfactor:6.2f}          {atom.element:>2s}\n"
                        pdb_file.write(atom_line)
        pdb_file.write("END\n")


pdb_file = "pdb1.pdb"
# Read the PDB file
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)

# Translation vector for x direction (0.55 nm, so 5.5Å)
translation_vector = [5.5, 0, 0]

# Translate all atoms of SER residue in x direction
translate_atoms(structure, "SER", translation_vector)

# Write the modified structure to a new PDB file without TER and END lines
output_pdb = "modified_pdb1.pdb"
save_without_ter_end(structure, output_pdb)


ATOM  19     HD2 TYR     1      26.910  61.717  39.871  1.00  0.00           H
ATOM  20       C TYR     1      29.796  62.354  41.246  1.00  0.00           C
ATOM  23       H SER     2      36.111  61.950  39.410  1.00  0.00           H
ATOM  24      CA SER     2      35.582  64.035  39.354  1.00  0.00           C
END

或者你可以直接在你的命令中加上这个代码。这个代码会遍历结构中的每个原子,手动写入 PDB 文件,而不包括 TER 和 END 行。

# Manually write the PDB file
with open(output_pdb, 'w') as pdb_file:
    for model in structure:
        for chain in model:
            for residue in chain:
                for atom in residue:
                    atom_line = f"{atom.get_id():6s}{atom.get_name():5s}{residue.resname:>4s} {chain.id:4s}{residue.id[1]:>4d}    {atom.coord[0]:8.3f}{atom.coord[1]:8.3f}{atom.coord[2]:8.3f}{atom.occupancy:6.2f}{atom.bfactor:6.2f}          {atom.element:>2s}\n"
                    pdb_file.write(atom_line)

或者你也可以通过以下方式实现相同的效果。

import pandas as pd
from Bio.PDB import PDBParser, PDBIO
# Read the original PDB file into a DataFrame
pdb_file = "pdb1.pdb"
columns = ["record_name", "atom_number", "atom_name", "residue_name", "chain_id", "residue_number", "x", "y", "z", "occupancy", "b_factor", "element_symbol", "charge"]
df = pd.read_csv(pdb_file, delim_whitespace=True, names=columns)

def translate_atoms(structure, resname, translation_vector):
    for model in structure:
        for chain in model:
            for residue in chain:
                if residue.resname == resname:
                    for atom in residue:
                        atom.coord += translation_vector

# Read the original PDB file
pdb_file = "pdb1.pdb"
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)

# Define translation vector
translation_vector = [5.5, 0, 0]

# Translate all atoms of SER residue in x direction
translate_atoms(structure, "SER", translation_vector)

# Write the modified structure to a new PDB file
output_pdb = "modified_pdb1.pdb"
pdb_io = PDBIO()
pdb_io.set_structure(structure)


with open(output_pdb, 'w') as pdb_file:
    for model in structure:
        for chain in model:
            for residue in chain:
                for atom in residue:
                    atom_line = f"ATOM  {atom.serial_number:>4d} {atom.element:>2s} {residue.resname:>4s} {chain.id:4s}{residue.id[1]:>4d}    {atom.coord[0]:8.3f}{atom.coord[1]:8.3f}{atom.coord[2]:8.3f}{atom.occupancy:6.2f}{atom.bfactor:6.2f}          {atom.element:>2s}\n"
                    pdb_file.write(atom_line)
    pdb_file.write("END\n")

print("Modified PDB file saved successfully.")

或者

from Bio.PDB import PDBParser, PDBIO
import pandas as pd

# Read the original PDB file
pdb_file = "pdb1.pdb"
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)

# Create an empty DataFrame to store the atom information
atom_data = []

# Iterate over the structure and populate the DataFrame with atom information
for model in structure:
    for chain in model:
        for residue in chain:
            for atom in residue:
                atom_data.append([
                    atom.serial_number,
                    atom.element,
                    residue.resname,
                    chain.id,
                    residue.id[1],
                    atom.coord[0],
                    atom.coord[1],
                    atom.coord[2],
                    atom.occupancy,
                    atom.bfactor,
                    atom.element
                ])

# Create a DataFrame from the collected atom data
columns = ["atom_number", "atom_name", "residue_name", "chain_id", "residue_number", "x", "y", "z", "occupancy", "b_factor", "charge"]
df = pd.DataFrame(atom_data, columns=columns)
# Define translation vector
translation_vector = [5.5, 0, 0]

# Translate all atoms of SER residue in x direction
ser_mask = df["residue_name"] == "SER"
df.loc[ser_mask, ["x", "y", "z"]] += translation_vector

# Save the modified DataFrame to a new PDB file
output_pdb = "modified_pdb1.pdb"
output_pdb = "modified_pdb1.pdb"
# Save the DataFrame to a new PDB file
output_pdb = "modified_pdb1.pdb"
with open(output_pdb, 'w') as pdb_file:
    for index, row in df.iterrows():
        atom_line = f"ATOM  {row['atom_number']:>4d} {row['atom_name']:>2s} {row['residue_name']:>4s} {row['chain_id']:4s}{row['residue_number']:>4d}    {row['x']:8.3f}{row['y']:8.3f}{row['z']:8.3f}{row['occupancy']:6.2f}{row['b_factor']:6.2f}          {row['charge']:>2s}\n"
        pdb_file.write(atom_line)
    pdb_file.write("END\n")

print("Modified PDB file saved successfully.")

撰写回答