所以我用麻省理工学院的刮板程序来让它工作。以前有人做过,被告知它正常工作,编码正确。我只是要解决一些配置问题,应该写。你知道吗
首先是完整程序的链接:https://github.com/ByteAcademyCo/mit_scraper {看一下ocw\ U上传的文件和这三个主要文件 配置.py麻省理工大学开放式课程_刮刀.py阅读_imsmanifest.py文件你知道吗
如果你能看一看它,解释一下每件事都做了些什么,那就太神奇了,因为我还没有把自己算作python方面的专家。你知道吗
为课程生成模式。。。 从开放式课程网站下载并解压课程内容,并将其路径添加到课程文件夹 在ocw\u upload文件夹中的配置文件中。通过运行read_imsmanifest.py文件,每个imsmanifest文件 将解析课程内容文件夹中包含的内容 以及重要的信息,包括课程的组成部分, 课程资源的路径(但不是这些资源的实际数据)和其他规范 当然,将被写入到构建中指定的json文件中_架构.py. 用于组织每门课程内容的确切模式可以是 在架构中找到_dict.py公司你知道吗
我做了所有这些事情,对事情的运作有了一点想法,所以在我看来,我觉得 这部分代码有问题
导入xml.etree.ElementTree文件作为元素树 导入xmltodict 导入操作系统
导入配置 从build\u schema导入SchemaBuilder
班主任:
def __init__(self, builder, config, course_folder):
self.full_path = os.path.join(config.COURSES_DIR, course_folder)
self.file_full_path = os.path.join(self.full_path, config.IMS_MANIFEST)
self._builder = builder
self.imsmanifest_reader = IMSManifestReader(self._builder, self.file_full_path)
def construct(self):
"""
Instruct reader to process input and pass specified data
to the builder for the product to be constructed.
"""
self.imsmanifest_reader.process()
类IMSManifestReader:
XMLNAMESPACES = {
"default": "http://www.imsglobal.org/xsd/imscp_v1p1",
"xsi" : "http://www.w3.org/2001/XMLSchema-instance",
"adlcp" : "http://www.adlnet.org/xsd/adlcp_rootv1p2",
"cwsp" : "http://www.dspace.org/xmlns/cwspace_imscp",
"ocw" : "https://ocw.mit.edu/xmlns/ocw_imscp",
"lom" : "https://ocw.mit.edu/xmlns/LOM"
}
def __init__(self, builder, imsmanifest):
self._builder = builder
self.imsmanifest = imsmanifest
def process(self):
"""
Parse input, pass data to builder for construction of product.
"""
root = self.find_imsmanifest_root(self.imsmanifest)
item_parent_map = self.map_parents_items(self.imsmanifest)
organizations = self.read_organizations(root, item_parent_map)
resources = self.read_resources(root)
metadata = self.read_subject_metadata(root)
course_name = self.read_course_name(root)
course_code = self.read_course_code(root)
course_version = self.read_version(root)
course_instructor = self.read_instructor(root)
product = self._builder.get_product()
def find_imsmanifest_root(self, imsmanifest):
"""
Read imsmanifest.xml into an element tree, find root.
Args: imsmanifest - full path for imsmanifest file
Returns: Root element of imsmanifest document's tree
"""
print(imsmanifest)
tree = ElementTree.parse(imsmanifest)
root = tree.getroot()
root_attribs = root.attrib
return root
def map_parents_items(self, imsmanifest):
"""
Map items to organization parent elements from
imsmanifest document tree.
Args: imsmanifest - full path for imsmanifest file
Returns: Dictionary with item ids as keys
and parent ids as values
"""
tree = ElementTree.parse(imsmanifest)
item_parent_map = {}
for parent in tree.iter():
for child in parent:
if "item" in child.tag:
item_parent_map[child.attrib["identifier"]] = parent.attrib["identifier"]
return item_parent_map
def read_organizations(self, root, item_parent_map):
"""
Get identifier, title, items of each organization
and pass these attributes to organization builder.
Args:
root - root of entire imsmanifest's document tree
which is an instance of the ElementTree wrapper class
item_parent_map - dictionary with item ids as keys
and their parents' ids as values
"""
organizations_element = root.find("default:organizations", self.XMLNAMESPACES)
organization_elements = organizations_element.findall("default:organization", self.XMLNAMESPACES)
for organization in organization_elements:
org_id = organization.attrib["identifier"]
title = organization.find("default:title", self.XMLNAMESPACES).text
self._builder.add_organization(org_id, title)
self.read_items(org_id, organization, item_parent_map)
def read_items(self, org_id, parent_node, item_parent_map):
"""
Get identifier, title, parent item identifier,
and other attributes of each item, pass these
features to item builder
Args: org_id - parent organization id
parent_node - parent of item (organization or item)
item_parent_map - dictionary with item ids as keys and their parents'
ids as values
"""
item_elements = parent_node.findall("default:item", self.XMLNAMESPACES)
for item in item_elements:
item_identifier = item.attrib["identifier"]
item_info = {}
for key, value in item.attrib.items():
if key != "identifier":
item_info[key] = value
item_info["title"] = item.find("default:title", self.XMLNAMESPACES).text
item_info["parent identifier"] = item_parent_map[str(item.attrib["identifier"])]
self._builder.add_item(item_identifier, item_info, org_id)
# #check for subitems
if item.attrib["identifier"] in item_parent_map.values():
self.read_items(org_id, item, item_parent_map)
def read_resources(self, root):
"""
Get identifier, type, href, dependencies,
metadata, files of each resource
and pass these attributes to resource builder.
Args:
root - root of entire imsmanifest's document tree
which is an instance of the ElementTree wrapper class
"""
resources_element = root.find("default:resources", self.XMLNAMESPACES)
resource_elements = resources_element.findall("default:resource", self.XMLNAMESPACES)
for resource in resource_elements:
resource_id = resource.attrib["identifier"]
#type, href are attributes of each resource element
resource_info = {}
for key, value in resource.attrib.items():
if key != "identifier":
resource_info[key] = value
self._builder.add_resource(resource_id, resource_info)
# #check for dependencies
dependency = resource.find("default:dependency", self.XMLNAMESPACES)
if dependency != None:
self.read_dependencies(resource_id, resource)
# #check for files
file = resource.find("default:file", self.XMLNAMESPACES)
if file != None:
self.read_files(resource_id, resource)
# #check for metadata
metadata = resource.find("default:metadata", self.XMLNAMESPACES)
if metadata != None:
self.read_document_metadata(resource_id, resource)
def read_dependencies(self, resource_identifier, resource_node):
"""
Get all identifier references of dependencies for a given resource,
pass resource_identifier and list of dependency identifierrefs to
builder
Args:
resource_identifier - identifier of parent resource
resource_node - resource object element to be searched for
dependency subelements
"""
dependencies = []
dependency_elements = resource_node.findall("default:dependency", self.XMLNAMESPACES)
for dependency in dependency_elements:
identifier_ref = dependency.attrib["identifierref"]
dependencies.append(identifier_ref)
self._builder.add_dependencies(resource_identifier, dependencies)
def read_files(self, resource_identifier, resource_node):
"""
Get all hrefs of files for a given resource,
pass resource_identifier and list of file hrefs to builder
Args:
resource_identifier - identifier of parent resource
resource_node - resource object element to be searched for
dependency subelements
"""
files = []
file_elements = resource_node.findall("default:file", self.XMLNAMESPACES)
for file in file_elements:
href = file.attrib["href"]
files.append(href)
self._builder.add_files(resource_identifier, files)
def read_document_metadata(self, resource_identifier, resource):
"""
Get metadata for a specified resource and pass to doc metadata builder
Args:
resource_identifier - identifier of parent resource
resource_node - resource object element to be searched for
dependency subelements
"""
metadata_element = resource.find("default:metadata", self.XMLNAMESPACES)
metadata = {}
namespace = self.XMLNAMESPACES["adlcp"] + "/location"
metadata = metadata_element.find("adlcp:location", self.XMLNAMESPACES).text
self._builder.add_doc_metadata(resource_identifier, namespace, metadata)
def read_subject_metadata(self, root):
"""
Get subject level metadata element object
and pass to subject metadata builder.
Args: root - root of entire imsmanifest's document tree
which is an instance of the ElementTree wrapper class
"""
metadata_element = root.find("default:metadata", self.XMLNAMESPACES)
stringified_metadata = ElementTree.tostring(metadata_element, encoding="unicode")
metadata_dict = xmltodict.parse(stringified_metadata, process_namespaces=True)
self._builder.add_subject_metadata(metadata_dict)
def read_course_name(self, root):
"""
Get the course name from metadata and pass it to metadata schema builder
Args: root - root of entire imsmanifest's document tree
which is an instance of the ElementTree wrapper class
"""
metadata_element = root.find("default:metadata", self.XMLNAMESPACES)
general_info_element = metadata_element.find("lom:general", self.XMLNAMESPACES)
title_element = general_info_element.find("lom:title", self.XMLNAMESPACES)
course_name = title_element.find("lom:string", self.XMLNAMESPACES).text
self._builder.add_schema_metadata("name", course_name)
def read_course_code(self, root):
"""
Get the course code from metadata and pass it to metadata schema builder
Args: root - root of entire imsmanifest's document tree
which is an instance of the ElementTree wrapper class
"""
metadata_element = root.find("default:metadata", self.XMLNAMESPACES)
general_info_element = metadata_element.find("lom:general", self.XMLNAMESPACES)
identifier_element = general_info_element.find("lom:identifier", self.XMLNAMESPACES)
course_code = identifier_element.find("lom:entry", self.XMLNAMESPACES).text
self._builder.add_schema_metadata("courseCode", course_code)
def read_version(self, root):
"""
Get the version of the course from metadata
and pass it to metadata schema builder
Args: root - root of entire imsmanifest's document tree
which is an instance of the ElementTree wrapper class
"""
metadata_element = root.find("default:metadata", self.XMLNAMESPACES)
lifecycle_element = metadata_element.find("lom:lifecycle", self.XMLNAMESPACES)
version_element = lifecycle_element.find("lom:version", self.XMLNAMESPACES)
course_version = version_element.find("lom:string", self.XMLNAMESPACES).text
self._builder.add_schema_metadata("version", course_version)
def read_instructor(self, root):
"""
Get the instructor of the course from metadata
and pass it to metadata schema builder
Args: root - root of entire imsmanifest's document tree
which is an instance of the ElementTree wrapper class
"""
metadata_element = root.find("default:metadata", self.XMLNAMESPACES)
lifecycle_element = metadata_element.find("lom:lifecycle", self.XMLNAMESPACES)
contribute_element = lifecycle_element.find("lom:contribute", self.XMLNAMESPACES)
try:
author = contribute_element.find("lom:entity", self.XMLNAMESPACES).text
except:
author = ''
self._builder.add_schema_metadata("Instructor", author)
def main(): 对于范围(0,2389)中的course\u文件夹: course\u folder=str(课程文件夹) director=director(schemabilder(course\u folder)、config、course\u folder) 施工总监()
如果名称=='main': 主()
或配置.py
课程\u DIR=“../COURSES”
课程文件夹=[ 'C:/Users/ycberrehouma/Desktop/18-06-spring-2010'
]
IMS\U清单=“imsmanifest.xml文件““
构建_架构.py
导入架构目录 导入json 从文件导入addDirToIPFS
类SchemaBuilder(): """ 通过实现 生成器界面。 定义并跟踪它创建的表示。 提供检索产品的接口。你知道吗
"""
def __init__(self, dir):
self._product = Product(dir)
def add_organization(self, org_identifier, org_title):
"""
Add to dictionary of organizations with the organization's
identifier as the key and a dictionary as the value
(with 'title' and 'items' as keys of this dictionary)
Args:
org_identifier - identifier of organization provided by imsmanifest
org_title - title of organization specified in imsmanifest
"""
info = {
"title" : org_title,
"items" : {}
}
self._product.organizations[org_identifier] = info
def add_item(self, item_identifier, item_info, org_id):
"""
Add item and its associated info to 'items' within
its parent organization in the dictionary of
organizations.
Args:
item_identifier - identifier of item provided by imsmanifest
item_info - other info (the item's title, parent identifier,
other attributes such as an identifierref,
a sectionTemplateTag, etc.)
org_id - parent organization of item
"""
item_dict = {}
item_dict[item_identifier] = item_info
self._product.organizations[org_id]['items'].update(item_dict)
def add_resource(self, resource_identifier, resource_info):
"""
Add a resource to dictionary of resources
with the resource's identifier as the key
and any other attributes as values.
Args:
resource_identifier - identifier of resource provided by imsmanifest
resource_info - dict of other info (including the resource's type, hrefs)
with the name of attributes as keys
and associated info as values
"""
self._product.resources[resource_identifier] = resource_info
def add_dependencies(self, resource_identifier, dependencies):
"""
Add key-value pair of dependencies of specified resource
to resource dictionary.
Args:
resource_identifier : identifier of resource provided by imsmanifest
dependencies: list of identifierrefs for dependencies of specified resource
"""
self._product.resources[resource_identifier]["dependencies"] = dependencies
def add_files(self, resource_identifier, files):
"""
Add key-value pair of files for specified resource
to resource dictionary ("files" as key and list of
hrefs of files as the value)
Args:
resource_identifier : identifier of resource provided by imsmanifest
files: list of hrefs for files of specified resource
"""
self._product.resources[resource_identifier]["files"] = files
def add_doc_metadata(self, resource_identifier, namespace, metadata):
"""
Add key-value pair of metadata for specified resource
to resource dictionary ("metadata" as key
and {namespace: metadata text} as value)
Args:
resource_identifier : identifier of resource provided by imsmanifest
namespace: full namespace of metadata (adlcp + location)
metadata: text of metadata element
"""
metadata_dict = {
namespace : metadata
}
self._product.resources[resource_identifier]["metadata"] = metadata_dict
def add_subject_metadata(self, metadata):
"""
Assign subject level metadata element (its subelements
and attributes) to an ordered dictionary of metadata.
Args: metadata - ordered dictionary generated from
metadata element object
(information includes metadata elements tag,
subelement tags, attributes, text)
"""
self._product.metadata = metadata
def add_schema_metadata(self, key, data):
"""
Add key-value pair of metadata to dictionary of schema data.
Args:
key - Corresponds to key in schema with same name
data - metadata bound to type
"""
self._product.schema_data[key] = data
def get_product(self):
self._product.to_schema()
self._product.schema_to_file()
等级产品: def初始化(self,dir): 自我组织= {} 自我资源= {} self.metadata= {} self.u数据= {} self.schema模式=架构_dict.schema格式 自我目录=方向
def to_schema(self):
#pass resources, organizations, metadata into schema
self.schema["resources"] = self.resources
self.schema["organizedMaterial"] = self.organizations
self.schema["courseSpecifications"] = self.metadata
#pass specific metadata (metadata common to every course) into schema
self.course_name_to_schema()
self.course_code_to_schema()
self.version_to_schema()
self.instructor_to_schema()
def course_name_to_schema(self):
course_name = self.schema_data["name"]
self.schema["name"] = course_name
def course_code_to_schema(self):
course_code = self.schema_data["courseCode"]
self.schema["courseCode"] = course_code
def version_to_schema(self):
version = self.schema_data["version"]
self.schema["version"] = version
def instructor_to_schema(self):
instructor = self.schema_data["Instructor"]
self.schema["hasCourseInstance"]["Instructor"]["name"] = instructor
self.schema["IPFS_files"] = addDirToIPFS(self.dir)
def schema_to_file(self):
with open('../courses/'+self.dir+'/schema.json', 'w') as outfile:
json.dump(self.schema, outfile)
当我运行read_imsmanifest.xml文件我发现了这些错误 https://i.stack.imgur.com/yfieM.png
非常感谢你的意图,如果你需要更多的细节请告诉我
回溯中的错误消息清楚地指出了问题所在:找不到文件
imsmanifest.xml
。路径不正确,文件/目录权限不允许访问,或者因为路径是相对路径,而您从错误的目录运行程序。你知道吗首先要检查路径
./courses\\0\\imsmanifest.xml
是否在执行程序的目录中有效。我注意到您的代码将courses目录定义为../courses
,而不是如回溯中所示的./courses
,因此您还需要检查它。你知道吗可以使用交互式Python会话进行测试。运行命令shell,切换到通常运行程序的目录,启动Python,然后键入
open('./courses\\0\\imsmanifest.xml')
。如果不起作用,请尝试:open(r'.\courses\0\ismanifest.xml')
。你知道吗在路径中混合使用正斜杠和反斜杠可能是问题所在。我无法证实这一点,但我相信这不重要。您可以通过确保在
config.py
中使用反斜杠来消除这个问题,即尝试COURSES_DIR = r"..\courses"
相关问题 更多 >
编程相关推荐