修改.pdb文件中的值并保存为新文件
为了完成一个作业,我需要用Python读取一个小的.pdb文件,修改一些数值,然后保存为一个新的.pdb文件。
我的原始文件是这样的:
ATOM 19 HD2 TYR 1 26.910 61.717 39.871 1.00 0.00 H
ATOM 20 C TYR 1 29.796 62.354 41.246 1.00 0.00 C
ATOM 23 H SER 2 30.611 61.950 39.410 1.00 0.00 H
ATOM 24 CA SER 2 30.082 64.035 39.354 1.00 0.00 C
END
我尝试过用Pandas,但没有成功,因为我无法以想要的格式保存文件,而且还会多保存一个索引,我不想要这个(我用的是.to_csv('newfile.pdb')
)。
我找到一个叫BioPython的模块,这是我用它的尝试:
from Bio.PDB import PDBParser, PDBIO
def translate_atoms(structure, resname, translation_vector):
for model in structure:
for chain in model:
for residue in chain:
if residue.resname == resname:
for atom in residue:
atom.coord += translation_vector
pdb_file = "pdb1.pdb"
# Read the PDB file
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)
# Translation vector for x direction (0.55 nm, so 5.5Å)
translation_vector = [5.5, 0, 0]
# Translate all atoms of SER residue in x direction
translate_atoms(structure, "SER", translation_vector)
# Write the modified structure to a new PDB file
output_pdb = "modified_pdb1.pdb"
pdb_io = PDBIO()
pdb_io.set_structure(structure)
pdb_io.save(output_pdb)
这个方法能让我修改数值,但在保存时会多加一行不需要的内容,像这样:
ATOM 19 HD2 TYR 1 26.910 61.717 39.871 1.00 0.00 H
ATOM 20 C TYR 1 29.796 62.354 41.246 1.00 0.00 C
ATOM 23 H SER 2 36.111 61.950 39.410 1.00 0.00 H
ATOM 24 CA SER 2 35.582 64.035 39.354 1.00 0.00 C
TER 33 SER 2
END
我该怎么做才能保存时不加那最后一行呢?
谢谢你的帮助!
2 个回答
好的,这是我根据你的输入做的尝试。
from Bio.PDB import PDBParser, PDBIO
import Bio
print('Biopython version : ', Bio.__version__)
def translate_atoms(structure, resname, translation_vector):
for model in structure:
for chain in model:
for residue in chain:
if residue.resname == resname:
for atom in residue:
atom.coord += translation_vector
pdb_file = "input.pdb"
# Read the PDB file
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)
# Translation vector for x direction (0.55 nm, so 5.5Å)
translation_vector = [5.5, 0, 0]
# Translate all atoms of SER residue in x direction
translate_atoms(structure, "SER", translation_vector)
# Write the modified structure to a new PDB file
output_pdb = "modified_pdb1.pdb"
pdb_io = PDBIO()
pdb_io.set_structure(structure)
pdb_io.save(output_pdb , write_end = False , preserve_atom_numbering = True)
这样会得到:
ATOM 19 HD2 TYR 1 26.910 61.717 39.871 1.00 0.00 H
ATOM 20 C TYR 1 29.796 62.354 41.246 1.00 0.00 C
ATOM 23 H SER 2 36.111 61.950 39.410 1.00 0.00 H
ATOM 24 CA SER 2 35.582 64.035 39.354 1.00 0.00 C
TER 25 SER 2
这是因为在 pdb_io.save(output_pdb , write_end = False , preserve_atom_numbering = True)
中使用了 write_end = False
。
可以查看文档 类 Bio.PDB.PDBIO.PDBIO(use_model_flag=0)
更长的代码:
from Bio.PDB import PDBParser, PDBIO, Select
import Bio
from Bio.PDB.PDBExceptions import PDBIOException
print('Biopython version : ', Bio.__version__)
class PDBIO(PDBIO):
def save(self, file, select = Select(), write_end=True, preserve_atom_numbering=False):
"""Save structure to a file.
:param file: output file
:type file: string or filehandle
:param select: selects which entities will be written.
:type select: object
Typically select is a subclass of L{Select}, it should
have the following methods:
- accept_model(model)
- accept_chain(chain)
- accept_residue(residue)
- accept_atom(atom)
These methods should return 1 if the entity is to be
written out, 0 otherwise.
Typically select is a subclass of L{Select}.
"""
if isinstance(file, str):
fhandle = open(file, "w")
else:
# filehandle, I hope :-)
fhandle = file
get_atom_line = self._get_atom_line
# multiple models?
if len(self.structure) > 1 or self.use_model_flag:
model_flag = 1
else:
model_flag = 0
for model in self.structure.get_list():
if not select.accept_model(model):
continue
# necessary for ENDMDL
# do not write ENDMDL if no residues were written
# for this model
model_residues_written = 0
if not preserve_atom_numbering:
atom_number = 1
if model_flag:
fhandle.write(f"MODEL {model.serial_num}\n")
for chain in model.get_list():
if not select.accept_chain(chain):
continue
chain_id = chain.id
if len(chain_id) > 1:
e = f"Chain id ('{chain_id}') exceeds PDB format limit."
raise PDBIOException(e)
# necessary for TER
# do not write TER if no residues were written
# for this chain
chain_residues_written = 0
for residue in chain.get_unpacked_list():
if not select.accept_residue(residue):
continue
hetfield, resseq, icode = residue.id
resname = residue.resname
segid = residue.segid
resid = residue.id[1]
if resid > 9999:
e = f"Residue number ('{resid}') exceeds PDB format limit."
raise PDBIOException(e)
for atom in residue.get_unpacked_list():
if not select.accept_atom(atom):
continue
chain_residues_written = 1
model_residues_written = 1
if preserve_atom_numbering:
atom_number = atom.serial_number
try:
s = get_atom_line(
atom,
hetfield,
segid,
atom_number,
resname,
resseq,
icode,
chain_id,
)
except Exception as err:
# catch and re-raise with more information
raise PDBIOException(
f"Error when writing atom {atom.full_id}"
) from err
else:
fhandle.write(s)
# inconsequential if preserve_atom_numbering is True
atom_number += 1
# if chain_residues_written:
# fhandle.write(
# _TER_FORMAT_STRING
# % (atom_number, resname, chain_id, resseq, icode)
# )
if model_flag and model_residues_written:
fhandle.write("ENDMDL\n")
if write_end:
fhandle.write("END \n")
if isinstance(file, str):
fhandle.close()
def translate_atoms(structure, resname, translation_vector):
for model in structure:
for chain in model:
for residue in chain:
if residue.resname == resname:
for atom in residue:
atom.coord += translation_vector
pdb_file = "input.pdb"
# Read the PDB file
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)
# Translation vector for x direction (0.55 nm, so 5.5Å)
translation_vector = [5.5, 0, 0]
# Translate all atoms of SER residue in x direction
translate_atoms(structure, "SER", translation_vector)
# Write the modified structure to a new PDB file
output_pdb = "modified_pdb1.pdb"
pdb_io = PDBIO()
pdb_io.set_structure(structure)
pdb_io.save(output_pdb , write_end = False ,, preserve_atom_numbering = True)
这样会得到:
ATOM 19 HD2 TYR 1 26.910 61.717 39.871 1.00 0.00 H
ATOM 20 C TYR 1 29.796 62.354 41.246 1.00 0.00 C
ATOM 23 H SER 2 36.111 61.950 39.410 1.00 0.00 H
ATOM 24 CA SER 2 35.582 64.035 39.354 1.00 0.00 C
因为我对 PDBIO
进行了修改,具体是 类 PDBIO(StructureIO) 中删除了:
# if chain_residues_written:
# fhandle.write(
# _TER_FORMAT_STRING
# % (atom_number, resname, chain_id, resseq, icode)
# )
不过需要导入: from Bio.PDB.PDBExceptions import PDBIOException
附带说明:
你的原始代码是错误的,缺少了: preserve_atom_numbering = True
,
在 pdb_io.save(output_pdb)
中,至少在最后一个稳定版本的 Biopython 中是这样。
我新增了一个叫做 save_without_ter_end 的函数,用来保存修改后的结构,但不包含 TER 和 END 这两行。这个 save_without_ter_end 函数会手动构建 PDB 文件的每一行,确保只写入 ATOM 记录,并在最后加上 END 行。
我去掉了 PDBIO 的使用,因为它不能满足我想要的效果,无法排除 TER 和 END 行。
from Bio.PDB import PDBParser, PDBIO
def translate_atoms(structure, resname, translation_vector):
for model in structure:
for chain in model:
for residue in chain:
if residue.resname == resname:
for atom in residue:
atom.coord += translation_vector
def save_without_ter_end(structure, filename):
with open(filename, 'w') as pdb_file:
for model in structure:
for chain in model:
for residue in chain:
for atom in residue:
atom_line = f"ATOM {atom.serial_number:<5d} {atom.get_name():>4s} {residue.resname:>3s} {chain.id}{residue.id[1]:>4d} {atom.coord[0]:8.3f}{atom.coord[1]:8.3f}{atom.coord[2]:8.3f}{atom.occupancy:6.2f}{atom.bfactor:6.2f} {atom.element:>2s}\n"
pdb_file.write(atom_line)
pdb_file.write("END\n")
pdb_file = "pdb1.pdb"
# Read the PDB file
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)
# Translation vector for x direction (0.55 nm, so 5.5Å)
translation_vector = [5.5, 0, 0]
# Translate all atoms of SER residue in x direction
translate_atoms(structure, "SER", translation_vector)
# Write the modified structure to a new PDB file without TER and END lines
output_pdb = "modified_pdb1.pdb"
save_without_ter_end(structure, output_pdb)
ATOM 19 HD2 TYR 1 26.910 61.717 39.871 1.00 0.00 H
ATOM 20 C TYR 1 29.796 62.354 41.246 1.00 0.00 C
ATOM 23 H SER 2 36.111 61.950 39.410 1.00 0.00 H
ATOM 24 CA SER 2 35.582 64.035 39.354 1.00 0.00 C
END
或者你可以直接在你的命令中加上这个代码。这个代码会遍历结构中的每个原子,手动写入 PDB 文件,而不包括 TER 和 END 行。
# Manually write the PDB file
with open(output_pdb, 'w') as pdb_file:
for model in structure:
for chain in model:
for residue in chain:
for atom in residue:
atom_line = f"{atom.get_id():6s}{atom.get_name():5s}{residue.resname:>4s} {chain.id:4s}{residue.id[1]:>4d} {atom.coord[0]:8.3f}{atom.coord[1]:8.3f}{atom.coord[2]:8.3f}{atom.occupancy:6.2f}{atom.bfactor:6.2f} {atom.element:>2s}\n"
pdb_file.write(atom_line)
或者你也可以通过以下方式实现相同的效果。
import pandas as pd
from Bio.PDB import PDBParser, PDBIO
# Read the original PDB file into a DataFrame
pdb_file = "pdb1.pdb"
columns = ["record_name", "atom_number", "atom_name", "residue_name", "chain_id", "residue_number", "x", "y", "z", "occupancy", "b_factor", "element_symbol", "charge"]
df = pd.read_csv(pdb_file, delim_whitespace=True, names=columns)
def translate_atoms(structure, resname, translation_vector):
for model in structure:
for chain in model:
for residue in chain:
if residue.resname == resname:
for atom in residue:
atom.coord += translation_vector
# Read the original PDB file
pdb_file = "pdb1.pdb"
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)
# Define translation vector
translation_vector = [5.5, 0, 0]
# Translate all atoms of SER residue in x direction
translate_atoms(structure, "SER", translation_vector)
# Write the modified structure to a new PDB file
output_pdb = "modified_pdb1.pdb"
pdb_io = PDBIO()
pdb_io.set_structure(structure)
with open(output_pdb, 'w') as pdb_file:
for model in structure:
for chain in model:
for residue in chain:
for atom in residue:
atom_line = f"ATOM {atom.serial_number:>4d} {atom.element:>2s} {residue.resname:>4s} {chain.id:4s}{residue.id[1]:>4d} {atom.coord[0]:8.3f}{atom.coord[1]:8.3f}{atom.coord[2]:8.3f}{atom.occupancy:6.2f}{atom.bfactor:6.2f} {atom.element:>2s}\n"
pdb_file.write(atom_line)
pdb_file.write("END\n")
print("Modified PDB file saved successfully.")
或者
from Bio.PDB import PDBParser, PDBIO
import pandas as pd
# Read the original PDB file
pdb_file = "pdb1.pdb"
pdb_parser = PDBParser(QUIET=True)
structure = pdb_parser.get_structure("original_pdb1", pdb_file)
# Create an empty DataFrame to store the atom information
atom_data = []
# Iterate over the structure and populate the DataFrame with atom information
for model in structure:
for chain in model:
for residue in chain:
for atom in residue:
atom_data.append([
atom.serial_number,
atom.element,
residue.resname,
chain.id,
residue.id[1],
atom.coord[0],
atom.coord[1],
atom.coord[2],
atom.occupancy,
atom.bfactor,
atom.element
])
# Create a DataFrame from the collected atom data
columns = ["atom_number", "atom_name", "residue_name", "chain_id", "residue_number", "x", "y", "z", "occupancy", "b_factor", "charge"]
df = pd.DataFrame(atom_data, columns=columns)
# Define translation vector
translation_vector = [5.5, 0, 0]
# Translate all atoms of SER residue in x direction
ser_mask = df["residue_name"] == "SER"
df.loc[ser_mask, ["x", "y", "z"]] += translation_vector
# Save the modified DataFrame to a new PDB file
output_pdb = "modified_pdb1.pdb"
output_pdb = "modified_pdb1.pdb"
# Save the DataFrame to a new PDB file
output_pdb = "modified_pdb1.pdb"
with open(output_pdb, 'w') as pdb_file:
for index, row in df.iterrows():
atom_line = f"ATOM {row['atom_number']:>4d} {row['atom_name']:>2s} {row['residue_name']:>4s} {row['chain_id']:4s}{row['residue_number']:>4d} {row['x']:8.3f}{row['y']:8.3f}{row['z']:8.3f}{row['occupancy']:6.2f}{row['b_factor']:6.2f} {row['charge']:>2s}\n"
pdb_file.write(atom_line)
pdb_file.write("END\n")
print("Modified PDB file saved successfully.")