如何使用feedparser检测RSS频道中是否有新项目?

2024-06-02 07:21:39 发布

您现在位置:Python中文网/ 问答频道 /正文

我有以下代码。读懂代码之后,请留意其中两处用大写字母写的注释。我本可以只靠 insert or ignore 来检测频道中是否有新条目,但我想尝试一种更好的机制:利用 feed.updated_parsed 属性。为什么它没有按预期工作?

from __future__ import unicode_literals
import feedparser
from sqlite3  import dbapi2 as sqlite
import sys, os
from datetime import datetime
from time import mktime
from daeutils import *
import re
import random
import optparse
import curses
import socket

def getActiveChannels():
  """Return every row of the ``channels`` table.

  Opens a fresh connection using the module-level ``connectionString`` and
  reads the ``id``, ``title``, ``xmlurl`` and ``updated`` columns.

  Returns:
    A list of 4-tuples ``(id, title, xmlurl, updated)``, one per channel.

  NOTE(review): despite the function name, the query has no filter on the
  ``active`` column, so every channel is returned -- confirm whether that is
  intended.
  """
  con = sqlite.connect(connectionString)
  try:
    cur = con.cursor()
    try:
      cur.execute("select id, title, xmlurl, updated from channels")
      return cur.fetchall()
    finally:
      cur.close()
  finally:
    # Close the connection even when the query raises, so a failing read
    # does not leak the database handle.
    con.close()

def getItemsForChannel(xmlUrl, lastUpdate):
  """Fetch a feed and return its entries unless the feed predates lastUpdate.

  Args:
    xmlUrl: location of the feed, passed straight to ``feedparser.parse``.
    lastUpdate: timestamp string in the form ``%Y-%m-%dT%H:%M:%S.%f``, or a
      false value when no previous fetch time is known.
      NOTE(review): assumed to be naive local time, as produced by
      ``datetime.now()`` -- confirm against whatever writes the column.

  Returns:
    The feed's entry list, or ``[]`` when the feed reports an update time
    earlier than ``lastUpdate``.

  Side effects:
    Sets the process-wide default socket timeout to 60 seconds.
  """
  # Imported locally so this function does not rely on the file's import
  # section providing calendar.
  from calendar import timegm

  socket.setdefaulttimeout(60)
  feedparserDictionary = feedparser.parse(xmlUrl)
  items = feedparserDictionary.entries

  # Not every feed publishes a feed-level date, and a channel may have no
  # previous fetch time. With either missing there is nothing to compare,
  # so all entries are returned instead of raising.
  updatedParsed = feedparserDictionary.feed.get("updated_parsed")
  if updatedParsed and lastUpdate:
    # feedparser hands back *_parsed dates as UTC time tuples. time.mktime()
    # would read the tuple as local time and shift the result by the UTC
    # offset, so fresh feeds could look older than lastUpdate. timegm()
    # reads the tuple as UTC and fromtimestamp() converts to local time,
    # which is the frame lastUpdate is expressed in.
    updatedTime = datetime.fromtimestamp(timegm(updatedParsed))
    lst = datetime.strptime(lastUpdate, "%Y-%m-%dT%H:%M:%S.%f")
    if updatedTime < lst:
      return []

  # Parenthesised single-argument form works under Python 2 and Python 3.
  print("There are new %d items" % len(items))
  return items

def setChannelUpdateTime(xmlUrl, tm):
  """Store ``tm`` as the last-update value of the channel with this URL.

  Args:
    xmlUrl: value matched against ``channels.xmlurl`` to select the row.
    tm: value written to ``channels.updated``.

  Prints "updated successfully" after the commit; this happens even when no
  row matched ``xmlUrl``, because the update itself does not fail then.
  """
  con = sqlite.connect(connectionString)
  try:
    cur = con.cursor()
    try:
      # Bind only the two values the statement names. Passing locals()
      # also handed the connection and cursor objects to the driver and
      # would silently break if a local variable were renamed.
      cur.execute(
          "update channels set updated = :tm where xmlurl = :xmlUrl",
          {"tm": tm, "xmlUrl": xmlUrl})
      con.commit()
      # Parenthesised single-argument form works under Python 2 and 3.
      print("updated successfully")
    finally:
      cur.close()
  finally:
    # Release the handle even when execute/commit raises.
    con.close()

# Script entry point: for every channel, fetch its feed, store unseen items
# in the feeds table and, if anything was inserted, stamp the channel with
# the current time.
#
# The guard previously compared against "_main__" (one leading underscore),
# which never matches, so the script body could not run at all.
if __name__ == "__main__":
  con = sqlite.connect(connectionString)
  try:
    for channel in getActiveChannels():
      channelId, channelTitle, channelXmlUrl, lastChannelUpdate = channel
      countOfNewItems = 0
      items = getItemsForChannel(channelXmlUrl, lastChannelUpdate)

      cur = con.cursor()
      try:
        for item in items:
          # Each item is a feedparser entry, i.e. a mapping. Tuple-unpacking
          # it would iterate over its keys (or raise on a length mismatch),
          # so the fields are read by name; absent fields become NULL.
          cur.execute(
              "insert or ignore into feeds "
              "(title, link, description, read, updated, channelid) "
              "values (?, ?, ?, ?, ?, ?)",
              (item.get("title"), item.get("link"), item.get("description"),
               0, item.get("updated"), channelId))
          # rowcount is 1 for a stored row and 0 when the insert was
          # ignored, so the sum counts rows that were really new.
          countOfNewItems += cur.rowcount
        con.commit()
      finally:
        cur.close()

      if countOfNewItems:
        print("Found new items")
        # %f always emits six microsecond digits, giving the fixed
        # "%Y-%m-%dT%H:%M:%S.%f" shape that getItemsForChannel parses,
        # with no need to pad isoformat() output by hand.
        now = datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")
        setChannelUpdateTime(channelXmlUrl, now)
  finally:
    # The connection used to stay open until interpreter exit.
    con.close()

以下是sqlite中的两个表:

-- channels: one row per RSS channel. xmlurl is unique; updated is stored as text.
-- NOTE(review): "string" is not one of SQLite's recognised type names, so those
-- columns presumably get NUMERIC affinity rather than TEXT -- confirm.
CREATE TABLE channels (id integer primary key, title string, text string, description string, type string, xmlurl string unique, htmlurl string, priority integer, active integer, deactivated integer, updated text);
-- feeds: one row per feed item, linked to its channel through channelid.
-- link is unique, so an "insert or ignore" of an already stored link adds no row.
CREATE TABLE feeds (id integer primary key, title string, link string unique, description string, read integer, priority integer, updated string, channelid integer, foreign key (channelid) references channels(id));

Tags: from import close sqlite datetime string title items
1条回答
网友
1楼 · 发布于 2024-06-02 07:21:39

我认为可能的问题在于:您比较的是 feed 级别的 updated 字段,而 feed 的发布者未必会可靠地维护这个字段;也可能是时区或时间格式的问题,因为代码里用到了 isoformat 等。

无论如何,我认为比较每个条目的 updated 属性要比比较 feed 级别的属性好得多,feed 级别的属性主要用于让 feed 缓存失效。

下面是一个可运行的示例,其中的函数只返回新条目。

import socket
from datetime import datetime, timedelta
from time import mktime

import feedparser
from pprint import pprint


def getItemsForChannel(xmlUrl, lastUpdate):
    """Return the entries of a feed that were updated after ``lastUpdate``.

    Args:
        xmlUrl: location of the feed, passed straight to ``feedparser.parse``.
        lastUpdate: ISO-8601 string accepted by ``datetime.fromisoformat``.
            NOTE(review): assumed to be naive local time (e.g. derived from
            ``datetime.now()``); an offset-aware value would not be
            comparable with the naive entry times computed below.

    Returns:
        A list of the feed entries whose update time is later than
        ``lastUpdate``. Entries that carry no parsed update time are
        included as well.

    Side effects:
        Sets the process-wide default socket timeout to 60 seconds and
        prints the number of entries returned.
    """
    # Imported locally so this function does not rely on the file's import
    # section providing calendar.
    from calendar import timegm

    lst = datetime.fromisoformat(lastUpdate)

    socket.setdefaulttimeout(60)

    parsed = feedparser.parse(xmlUrl)

    def is_new(entry):
        stamp = entry.get("updated_parsed")
        if stamp is None:
            # Without a date the entry cannot be ruled out; returning it is
            # the conservative choice and avoids crashing on such feeds.
            return True
        # feedparser hands back *_parsed dates as UTC time tuples.
        # time.mktime() would read the tuple as local time and skew the
        # result by the UTC offset; timegm() reads it as UTC, and
        # fromtimestamp() then converts to local time to match ``lst``.
        return datetime.fromtimestamp(timegm(stamp)) > lst

    items = [entry for entry in parsed.entries if is_new(entry)]
    print("There are new {} items".format(len(items)))
    return items


# Demo: pretty-print the entries of a sample feed that were updated within
# the last three hours.
feed_url = 'http://serverfault.com/feeds/tag/+or+linux+or+ubuntu+or+vim+or+rsync+or+gnome'
three_hours_ago = datetime.now() - timedelta(hours=3)
recent_items = getItemsForChannel(feed_url, three_hours_ago.isoformat())
pprint(recent_items)

它用 ISO 格式(isoformat / fromisoformat)在数据库中存取上次解析的日期,并逐个条目进行比较,而不是基于 feed 的 updated 属性做一次全局比较。

相关问题 更多 >