Python麻省理工学院课程开放式课程网页

2021-12-08 05:42:24 发布

您现在位置:Python中文网/ 问答频道 /正文

所以我用麻省理工学院的刮板程序来让它工作。以前有人做过,被告知它正常工作,编码正确。我只是要解决一些配置问题,应该写。你知道吗

首先是完整程序的链接:https://github.com/ByteAcademyCo/mit_scraper {看一下ocw\ U上传的文件和这三个主要文件 配置.py麻省理工大学开放式课程_刮刀.py阅读_imsmanifest.py文件你知道吗

如果你能看一看它,解释一下每件事都做了些什么,那就太神奇了,因为我还没有把自己算作python方面的专家。你知道吗

为课程生成模式。。。 从开放式课程网站下载并解压课程内容,并将其路径添加到课程文件夹 在ocw\u upload文件夹中的配置文件中。通过运行read_imsmanifest.py文件,每个imsmanifest文件 将解析课程内容文件夹中包含的内容 以及重要的信息,包括课程的组成部分, 课程资源的路径(但不是这些资源的实际数据)和其他规范 当然,将被写入到构建中指定的json文件中_架构.py. 用于组织每门课程内容的确切模式可以是 在架构中找到_dict.py公司你知道吗

我做了所有这些事情,对事情的运作有了一点想法,所以在我看来,我觉得 这部分代码有问题

导入xml.etree.ElementTree文件作为元素树 导入xmltodict 导入操作系统

导入配置 从build\u schema导入SchemaBuilder

班主任:

def __init__(self, builder, config, course_folder):
    self.full_path = os.path.join(config.COURSES_DIR, course_folder)
    self.file_full_path = os.path.join(self.full_path, config.IMS_MANIFEST)

    self._builder = builder
    self.imsmanifest_reader = IMSManifestReader(self._builder, self.file_full_path)


def construct(self):
    """
    Instruct reader to process input and pass specified data 
    to the builder for the product to be constructed.
    """
    self.imsmanifest_reader.process()

类IMSManifestReader:

XMLNAMESPACES = {
        "default": "http://www.imsglobal.org/xsd/imscp_v1p1",
        "xsi" : "http://www.w3.org/2001/XMLSchema-instance",
        "adlcp" : "http://www.adlnet.org/xsd/adlcp_rootv1p2",
        "cwsp" : "http://www.dspace.org/xmlns/cwspace_imscp",
        "ocw" : "https://ocw.mit.edu/xmlns/ocw_imscp",
        "lom" : "https://ocw.mit.edu/xmlns/LOM"
    }

def __init__(self, builder, imsmanifest):
    self._builder = builder
    self.imsmanifest = imsmanifest


def process(self):
    """
    Parse input, pass data to builder for construction of product.
    """
    root = self.find_imsmanifest_root(self.imsmanifest)
    item_parent_map = self.map_parents_items(self.imsmanifest)

    organizations = self.read_organizations(root, item_parent_map)
    resources = self.read_resources(root)
    metadata = self.read_subject_metadata(root)
    course_name = self.read_course_name(root)
    course_code = self.read_course_code(root)
    course_version = self.read_version(root)
    course_instructor = self.read_instructor(root)

    product = self._builder.get_product()



def find_imsmanifest_root(self, imsmanifest):
    """
    Read imsmanifest.xml into an element tree, find root.

    Args: imsmanifest - full path for imsmanifest file

    Returns: Root element of imsmanifest document's tree

    """

    print(imsmanifest)
    tree = ElementTree.parse(imsmanifest)
    root = tree.getroot()
    root_attribs = root.attrib
    return root

def map_parents_items(self, imsmanifest):
    """
    Map items to organization parent elements from 
    imsmanifest document tree.

    Args: imsmanifest - full path for imsmanifest file

    Returns: Dictionary with item ids as keys 
            and parent ids as values
    """

    tree = ElementTree.parse(imsmanifest)
    item_parent_map = {}
    for parent in tree.iter():
        for child in parent:
            if "item" in child.tag:
                item_parent_map[child.attrib["identifier"]] = parent.attrib["identifier"]

    return item_parent_map


def read_organizations(self, root, item_parent_map):
    """
    Get identifier, title, items of each organization
    and pass these attributes to organization builder.

    Args:   
        root - root of entire imsmanifest's document tree 
               which is an instance of the ElementTree wrapper class 
        item_parent_map - dictionary with item ids as keys
                             and their parents' ids as values 

    """

    organizations_element = root.find("default:organizations", self.XMLNAMESPACES)
    organization_elements = organizations_element.findall("default:organization", self.XMLNAMESPACES)
    for organization in organization_elements:
        org_id = organization.attrib["identifier"]
        title = organization.find("default:title", self.XMLNAMESPACES).text
        self._builder.add_organization(org_id, title)
        self.read_items(org_id, organization, item_parent_map)

def read_items(self, org_id, parent_node, item_parent_map):
    """
    Get identifier, title, parent item identifier,
    and other attributes of each item, pass these
    features to item builder

    Args: org_id - parent organization id
          parent_node - parent of item (organization or item)
          item_parent_map - dictionary with item ids as keys and their parents' 
                            ids as values

    """
    item_elements = parent_node.findall("default:item", self.XMLNAMESPACES)

    for item in item_elements:
        item_identifier = item.attrib["identifier"]

        item_info = {}
        for key, value in item.attrib.items():
            if key != "identifier":
                item_info[key] = value

        item_info["title"] = item.find("default:title", self.XMLNAMESPACES).text
        item_info["parent identifier"] = item_parent_map[str(item.attrib["identifier"])]

        self._builder.add_item(item_identifier, item_info, org_id)

        # #check for subitems
        if item.attrib["identifier"] in item_parent_map.values():
            self.read_items(org_id, item, item_parent_map)



def read_resources(self, root):
    """
    Get identifier, type, href, dependencies,
    metadata, files of each resource
    and pass these attributes to resource builder.

    Args:   
        root - root of entire imsmanifest's document tree 
               which is an instance of the ElementTree wrapper class 
    """

    resources_element = root.find("default:resources", self.XMLNAMESPACES)
    resource_elements = resources_element.findall("default:resource", self.XMLNAMESPACES)

    for resource in resource_elements:
        resource_id = resource.attrib["identifier"]

        #type, href are attributes of each resource element
        resource_info = {}
        for key, value in resource.attrib.items():
            if key != "identifier":
                resource_info[key] = value

        self._builder.add_resource(resource_id, resource_info)

    #   #check for dependencies
        dependency = resource.find("default:dependency", self.XMLNAMESPACES)
        if dependency != None:
            self.read_dependencies(resource_id, resource)

    #   #check for files
        file = resource.find("default:file", self.XMLNAMESPACES)
        if file != None:
            self.read_files(resource_id, resource)

    #   #check for metadata
        metadata = resource.find("default:metadata", self.XMLNAMESPACES)
        if metadata != None:
            self.read_document_metadata(resource_id, resource)



def read_dependencies(self, resource_identifier, resource_node):
    """
    Get all identifier references of dependencies for a given resource,
    pass resource_identifier and list of dependency identifierrefs to
    builder

    Args: 
        resource_identifier - identifier of parent resource
        resource_node - resource object element to be searched for 
                        dependency subelements
    """
    dependencies = []
    dependency_elements = resource_node.findall("default:dependency", self.XMLNAMESPACES)

    for dependency in dependency_elements:
        identifier_ref = dependency.attrib["identifierref"]
        dependencies.append(identifier_ref)

    self._builder.add_dependencies(resource_identifier, dependencies)


def read_files(self, resource_identifier, resource_node):
    """
    Get all hrefs of files for a given resource,
    pass resource_identifier and list of file hrefs to builder

    Args: 
        resource_identifier - identifier of parent resource
        resource_node - resource object element to be searched for 
                        dependency subelements
    """
    files = []

    file_elements = resource_node.findall("default:file", self.XMLNAMESPACES)

    for file in file_elements:
        href = file.attrib["href"]
        files.append(href)

    self._builder.add_files(resource_identifier, files)


def read_document_metadata(self, resource_identifier, resource):
    """
    Get metadata for a specified resource and pass to doc metadata builder

    Args: 
        resource_identifier - identifier of parent resource
        resource_node - resource object element to be searched for 
                        dependency subelements
    """
    metadata_element = resource.find("default:metadata", self.XMLNAMESPACES)

    metadata = {}
    namespace = self.XMLNAMESPACES["adlcp"] + "/location"
    metadata = metadata_element.find("adlcp:location", self.XMLNAMESPACES).text

    self._builder.add_doc_metadata(resource_identifier, namespace, metadata)


def read_subject_metadata(self, root):
    """
    Get subject level metadata element object
     and pass to subject metadata builder. 

    Args: root - root of entire imsmanifest's document tree 
                 which is an instance of the ElementTree wrapper class 
    """
    metadata_element = root.find("default:metadata", self.XMLNAMESPACES)
    stringified_metadata = ElementTree.tostring(metadata_element, encoding="unicode")
    metadata_dict = xmltodict.parse(stringified_metadata, process_namespaces=True)

    self._builder.add_subject_metadata(metadata_dict)

def read_course_name(self, root):
    """
    Get the course name from metadata and pass it to metadata schema builder

    Args: root - root of entire imsmanifest's document tree 
                 which is an instance of the ElementTree wrapper class 
    """

    metadata_element = root.find("default:metadata", self.XMLNAMESPACES)
    general_info_element = metadata_element.find("lom:general", self.XMLNAMESPACES)
    title_element = general_info_element.find("lom:title", self.XMLNAMESPACES)
    course_name = title_element.find("lom:string", self.XMLNAMESPACES).text

    self._builder.add_schema_metadata("name", course_name)

def read_course_code(self, root):
    """
    Get the course code from metadata and pass it to metadata schema builder

    Args: root - root of entire imsmanifest's document tree 
                 which is an instance of the ElementTree wrapper class 
    """
    metadata_element = root.find("default:metadata", self.XMLNAMESPACES)
    general_info_element = metadata_element.find("lom:general", self.XMLNAMESPACES)
    identifier_element = general_info_element.find("lom:identifier", self.XMLNAMESPACES)
    course_code = identifier_element.find("lom:entry", self.XMLNAMESPACES).text

    self._builder.add_schema_metadata("courseCode", course_code)

def read_version(self, root):
    """
    Get the version of the course from metadata 
    and pass it to metadata schema builder

    Args: root - root of entire imsmanifest's document tree 
                 which is an instance of the ElementTree wrapper class 
    """
    metadata_element = root.find("default:metadata", self.XMLNAMESPACES)
    lifecycle_element = metadata_element.find("lom:lifecycle", self.XMLNAMESPACES)
    version_element = lifecycle_element.find("lom:version", self.XMLNAMESPACES)
    course_version = version_element.find("lom:string", self.XMLNAMESPACES).text

    self._builder.add_schema_metadata("version", course_version)

def read_instructor(self, root):
    """
    Get the instructor of the course from metadata 
    and pass it to metadata schema builder

    Args: root - root of entire imsmanifest's document tree 
                 which is an instance of the ElementTree wrapper class 
    """
    metadata_element = root.find("default:metadata", self.XMLNAMESPACES)
    lifecycle_element = metadata_element.find("lom:lifecycle", self.XMLNAMESPACES)
    contribute_element = lifecycle_element.find("lom:contribute", self.XMLNAMESPACES)
    try:
        author = contribute_element.find("lom:entity", self.XMLNAMESPACES).text
    except:
        author = ''

    self._builder.add_schema_metadata("Instructor", author)

def main(): 对于范围(0,2389)中的course\u文件夹: course\u folder=str(课程文件夹) director=director(schemabilder(course\u folder)、config、course\u folder) 施工总监()

如果名称=='main': 主()

或配置.py

课程\u DIR=“../COURSES”

课程文件夹=[ 'C:/Users/ycberrehouma/Desktop/18-06-spring-2010'

]

IMS\U清单=“imsmanifest.xml文件““

构建_架构.py

导入架构目录 导入json 从文件导入addDirToIPFS

类SchemaBuilder(): """ 通过实现 生成器界面。 定义并跟踪它创建的表示。 提供检索产品的接口。你知道吗

"""
def __init__(self, dir):
    self._product = Product(dir)


def add_organization(self, org_identifier, org_title):

    """
    Add to dictionary of organizations with the organization's
    identifier as the key and a dictionary as the value 
    (with 'title' and 'items' as keys of this dictionary)

    Args:
        org_identifier - identifier of organization provided by imsmanifest
        org_title - title of organization specified in imsmanifest
    """
    info = {
        "title" : org_title,
        "items" : {}
    }

    self._product.organizations[org_identifier] = info

def add_item(self, item_identifier, item_info, org_id):

    """
    Add item and its associated info to 'items' within
    its parent organization in the dictionary of 
    organizations.

    Args:
        item_identifier - identifier of item provided by imsmanifest
        item_info - other info (the item's title, parent identifier, 
                    other attributes such as an identifierref, 
                    a sectionTemplateTag, etc.) 
        org_id - parent organization of item
    """

    item_dict = {}
    item_dict[item_identifier] = item_info
    self._product.organizations[org_id]['items'].update(item_dict)

def add_resource(self, resource_identifier, resource_info):
    """
    Add a resource to dictionary of resources 
    with the resource's identifier as the key
    and any other attributes as values.

    Args:
        resource_identifier - identifier of resource provided by imsmanifest
        resource_info - dict of other info (including the resource's type, hrefs)
                        with the name of attributes as keys 
                        and associated info as values

    """
    self._product.resources[resource_identifier] = resource_info

def add_dependencies(self, resource_identifier, dependencies):
    """
    Add key-value pair of dependencies of specified resource 
    to resource dictionary.

    Args:
        resource_identifier : identifier of resource provided by imsmanifest
        dependencies: list of identifierrefs for dependencies of specified resource
    """
    self._product.resources[resource_identifier]["dependencies"] = dependencies

def add_files(self, resource_identifier, files):
    """
    Add key-value pair of files for specified resource 
    to resource dictionary ("files" as key and list of
    hrefs of files as the value)

    Args:
        resource_identifier : identifier of resource provided by imsmanifest
        files: list of hrefs for files of specified resource
    """
    self._product.resources[resource_identifier]["files"] = files

def add_doc_metadata(self, resource_identifier, namespace, metadata):
    """
    Add key-value pair of metadata for specified resource
    to resource dictionary ("metadata" as key 
    and {namespace: metadata text} as value)

    Args: 
        resource_identifier : identifier of resource provided by imsmanifest
        namespace: full namespace of metadata (adlcp + location)
        metadata: text of metadata element
    """
    metadata_dict = {
            namespace : metadata
        }

    self._product.resources[resource_identifier]["metadata"] = metadata_dict

def add_subject_metadata(self, metadata):
    """
    Assign subject level metadata element (its subelements 
    and attributes) to an ordered dictionary of metadata.

    Args: metadata - ordered dictionary generated from
                     metadata element object 
                     (information includes metadata elements tag, 
                      subelement tags, attributes, text)
    """
    self._product.metadata = metadata

def add_schema_metadata(self, key, data):
    """
    Add key-value pair of metadata to dictionary of schema data.

    Args:
        key - Corresponds to key in schema with same name
        data - metadata bound to type
    """
    self._product.schema_data[key] = data

def get_product(self):
    self._product.to_schema()
    self._product.schema_to_file()

等级产品: def初始化(self,dir): 自我组织= {} 自我资源= {} self.metadata= {} self.u数据= {} self.schema模式=架构_dict.schema格式 自我目录=方向

def to_schema(self):
    #pass resources, organizations, metadata into schema
    self.schema["resources"] = self.resources
    self.schema["organizedMaterial"] = self.organizations
    self.schema["courseSpecifications"] = self.metadata

    #pass specific metadata (metadata common to every course) into schema
    self.course_name_to_schema()
    self.course_code_to_schema()
    self.version_to_schema()
    self.instructor_to_schema()

def course_name_to_schema(self):
    course_name = self.schema_data["name"]
    self.schema["name"] = course_name

def course_code_to_schema(self):
    course_code = self.schema_data["courseCode"]
    self.schema["courseCode"] = course_code

def version_to_schema(self):
    version = self.schema_data["version"]
    self.schema["version"] = version

def instructor_to_schema(self):
    instructor = self.schema_data["Instructor"]
    self.schema["hasCourseInstance"]["Instructor"]["name"] = instructor

    self.schema["IPFS_files"] = addDirToIPFS(self.dir)


def schema_to_file(self):
    with open('../courses/'+self.dir+'/schema.json', 'w') as outfile:
        json.dump(self.schema, outfile)

当我运行read_imsmanifest.xml文件我发现了这些错误 https://i.stack.imgur.com/yfieM.png

非常感谢你的意图,如果你需要更多的细节请告诉我