Python 使用BeautifulSoup从JATS XML获取日期

Python 使用BeautifulSoup从JATS XML获取日期,python,xml-parsing,beautifulsoup,Python,Xml Parsing,Beautifulsoup,如何使用BeautifulSoup从JATS XML中提取日期(epub) <pub-date pub-type="epub"> <day>12</day> <month>09</month> <year>2011</year> </pub-date> 12 09 2011 → 2011-09-12 <pub-date pub-type="collection"> <year&

如何使用BeautifulSoup从JATS XML中提取日期(epub)

<pub-date pub-type="epub">
<day>12</day>
<month>09</month>
<year>2011</year>
</pub-date>

12
09
2011
→ 2011-09-12

<pub-date pub-type="collection">
<year>2011</year>
</pub-date>

2011

应该忽略。

在您的示例中,
pub-type
是 pub-date 元素的一个属性,该属性的值为
"epub"
。要遍历 JATS XML 这类标准格式的文档树,您需要使用 lxml:可以将其作为 BeautifulSoup 的解析器,或者直接使用 lxml.etree。

这里有两个使用lxml.etree的函数,它们仅在属性为“epub”时使用xpath解析候选日期字段。我特别基于PLOS的JATSXML格式,希望在这里得到应用

import datetime
import lxml.etree as et

def parse_article_date(date_element, date_format='%Y %m %d'):
    """
    Convert the fields of an article date element to a datetime object.

    The element's direct children are scanned for ``year``, ``month`` and
    ``day`` tags. Their text is joined as "year month day" (single spaces)
    and parsed with ``date_format``.

    :param date_element: XML element (e.g. a JATS ``<pub-date>``) whose
        direct children hold the date fields
    :param date_format: string format used to convert to datetime object
    :return: datetime object based on XML date fields
    :raises ValueError: if the joined fields do not match ``date_format``,
        for example when the month or day child is missing or empty
    """
    fields = {'year': '', 'month': '', 'day': ''}
    # Iterate the element directly: getchildren() is deprecated in lxml and
    # was removed from xml.etree.ElementTree in Python 3.9.
    for item in date_element:
        if item.tag in fields:
            # An empty tag such as <day/> has text None; treat it as missing
            # so the failure is a ValueError from strptime, not a TypeError.
            fields[item.tag] = (item.text or '').strip()
    string_date = ' '.join((fields['year'], fields['month'], fields['day']))

    return datetime.datetime.strptime(string_date, date_format)

def get_article_pubdate(article_file, tag_path_elements=None, string_=False):
    """
    Look up the electronic publication date of a single article XML file.

    The file is parsed with lxml, the path components are joined with '/'
    into one XPath expression, and every matching element whose ``pub-type``
    attribute equals 'epub' is converted with ``parse_article_date``.
    The raw XPath matches are printed before they are filtered.

    :param article_file: the xml file for a single article (passed to ``et.parse``)
    :param tag_path_elements: sequence of XPath path components; defaults to
        ("/", "article", "front", "article-meta", "pub-date")
    :param string_: defaults to False. If True, dates are returned as
        '%Y-%m-%d' strings instead of datetime objects
    :return: dict mapping the pub-type ('epub') to its date; empty when no
        epub date element is found
    """
    if tag_path_elements is None:
        tag_path_elements = ("/", "article", "front", "article-meta", "pub-date")

    root = et.parse(article_file).getroot()
    xpath_query = '/'.join(tag_path_elements)
    pub_date_fields = root.xpath(xpath_query)
    # Show the raw XPath search results so callers can inspect what matched.
    print(pub_date_fields)

    pub_date = {}
    for node in pub_date_fields:
        kind = node.get('pub-type')
        if kind != 'epub':
            # Other pub-types (e.g. "collection") are ignored.
            continue
        pub_date[kind] = parse_article_date(node)

    if not string_:
        return pub_date

    # you can set this to any date format
    return {
        kind: (parsed.strftime('%Y-%m-%d') if parsed else parsed)
        for kind, parsed in pub_date.items()
    }

在您的示例中,
pub-type
是 pub-date 元素的一个属性,该属性的值是
"epub"
。要遍历 JATS XML 这类标准格式的文档树,您需要使用 lxml:可以将其作为 BeautifulSoup 的解析器,或者直接使用 lxml.etree。

这里有两个使用lxml.etree的函数,它们仅在属性为“epub”时使用xpath解析候选日期字段。我特别基于PLOS的JATSXML格式,希望在这里得到应用

import datetime
import lxml.etree as et

def parse_article_date(date_element, date_format='%Y %m %d'):
    """
    Convert the fields of an article date element to a datetime object.

    The element's direct children are scanned for ``year``, ``month`` and
    ``day`` tags. Their text is joined as "year month day" (single spaces)
    and parsed with ``date_format``.

    :param date_element: XML element (e.g. a JATS ``<pub-date>``) whose
        direct children hold the date fields
    :param date_format: string format used to convert to datetime object
    :return: datetime object based on XML date fields
    :raises ValueError: if the joined fields do not match ``date_format``,
        for example when the month or day child is missing or empty
    """
    fields = {'year': '', 'month': '', 'day': ''}
    # Iterate the element directly: getchildren() is deprecated in lxml and
    # was removed from xml.etree.ElementTree in Python 3.9.
    for item in date_element:
        if item.tag in fields:
            # An empty tag such as <day/> has text None; treat it as missing
            # so the failure is a ValueError from strptime, not a TypeError.
            fields[item.tag] = (item.text or '').strip()
    string_date = ' '.join((fields['year'], fields['month'], fields['day']))

    return datetime.datetime.strptime(string_date, date_format)

def get_article_pubdate(article_file, tag_path_elements=None, string_=False):
    """
    Look up the electronic publication date of a single article XML file.

    The file is parsed with lxml, the path components are joined with '/'
    into one XPath expression, and every matching element whose ``pub-type``
    attribute equals 'epub' is converted with ``parse_article_date``.
    The raw XPath matches are printed before they are filtered.

    :param article_file: the xml file for a single article (passed to ``et.parse``)
    :param tag_path_elements: sequence of XPath path components; defaults to
        ("/", "article", "front", "article-meta", "pub-date")
    :param string_: defaults to False. If True, dates are returned as
        '%Y-%m-%d' strings instead of datetime objects
    :return: dict mapping the pub-type ('epub') to its date; empty when no
        epub date element is found
    """
    if tag_path_elements is None:
        tag_path_elements = ("/", "article", "front", "article-meta", "pub-date")

    root = et.parse(article_file).getroot()
    xpath_query = '/'.join(tag_path_elements)
    pub_date_fields = root.xpath(xpath_query)
    # Show the raw XPath search results so callers can inspect what matched.
    print(pub_date_fields)

    pub_date = {}
    for node in pub_date_fields:
        kind = node.get('pub-type')
        if kind != 'epub':
            # Other pub-types (e.g. "collection") are ignored.
            continue
        pub_date[kind] = parse_article_date(node)

    if not string_:
        return pub_date

    # you can set this to any date format
    return {
        kind: (parsed.strftime('%Y-%m-%d') if parsed else parsed)
        for kind, parsed in pub_date.items()
    }

谢谢你的辛勤工作。举个例子:如何查找xpath搜索结果?添加了一个print语句,以便在运行函数时可以看到它。表示xpath搜索结果的项是
pub_date_fields
。如果您的问题是如何处理远程托管的XML,那么您可以运行
get_article_pubdate(url)
,因为etree还可以解析远程文档。希望这有帮助!谢谢你的辛勤工作。举个例子:如何查找xpath搜索结果?添加了一个print语句,以便在运行函数时可以看到它。表示xpath搜索结果的项是
pub_date_fields
。如果您的问题是如何处理远程托管的XML,那么您可以运行
get_article_pubdate(url)
,因为etree还可以解析远程文档。希望这有帮助!