python 从字符串获取元素 by id
我有一个程序,它试图将文件(或者多个文件)上传到一个图片上传网站,但我在解析返回的HTML时遇到了困难,想要提取出直接链接(这个链接在一个 <dd class="download"><input type="text" value="{这里是链接}"></dd>
标签里面)。
下面是我的代码:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import pycurl
import urllib
import urlparse
import xml.dom.minidom
import StringIO
import sys
import gtk
import os
import imghdr
import locale
import gettext
try:
import pynotify
except:
print "Install pynotify. It's whoasome!"
APP="Uploadir Uploader"
DIR="locale"
locale.setlocale(locale.LC_ALL, '')
gettext.bindtextdomain(APP, DIR)
gettext.textdomain(APP)
_ = gettext.gettext
##STRINGS
uploading = _("Uploading image to Uploadir.")
oneimage = _("1 image has been successfully uploaded.")
multimages = _("images have been successfully uploaded.")
uploadfailed = _("Unable to upload to Uploadir.")
class Uploadir:
def __init__(self, args):
self.images = []
self.urls = []
self.broadcasts = []
self.username=""
self.password=""
if len(args) == 1:
return
else:
for file in args:
if file == args[0] or file == "":
continue
if file.startswith("-u"):
self.username = file.split("-u")[1]
#print self.username
continue
if file.startswith("-p"):
self.password = file.split("-p")[1]
#print self.password
continue
self.type = imghdr.what(file)
self.images.append(file)
for file in self.images:
self.upload(file)
self.setClipBoard()
self.broadcast(self.broadcasts)
def broadcast(self, l):
try:
str = '\n'.join(l)
n = pynotify.Notification(str)
n.set_urgency(pynotify.URGENCY_LOW)
n.show()
except:
for line in l:
print line
def upload(self, file):
#Try to login
cookie_file_name = "/tmp/uploadircookie"
if ( self.username!="" and self.password!=""):
print "Uploadir authentication in progress"
l=pycurl.Curl()
loginData = [ ("username",self.username),("password", self.password), ("login", "Login") ]
l.setopt(l.URL, "http://uploadir.com/user/login")
l.setopt(l.HTTPPOST, loginData)
l.setopt(l.USERAGENT,"User-Agent: Uploadir (Python Image Uploader)")
l.setopt(l.FOLLOWLOCATION,1)
l.setopt(l.COOKIEFILE,cookie_file_name)
l.setopt(l.COOKIEJAR,cookie_file_name)
l.setopt(l.HEADER,1)
loginDataReturnedBuffer = StringIO.StringIO()
l.setopt( l.WRITEFUNCTION, loginDataReturnedBuffer.write )
if l.perform():
self.broadcasts.append("Login failed. Please check connection.")
l.close()
return
loginDataReturned = loginDataReturnedBuffer.getvalue()
l.close()
#print loginDataReturned
if loginDataReturned.find("<li>Your supplied username or password is invalid.</li>")!=-1:
self.broadcasts.append("Uploadir authentication failed. Username/password invalid.")
return
else:
self.broadcasts.append("Uploadir authentication successful.")
#cookie = loginDataReturned.split("Set-Cookie: ")[1]
#cookie = cookie.split(";",0)
#print cookie
c = pycurl.Curl()
values = [
("file", (c.FORM_FILE, file)),
("terms", "1"),
("submit", "submit")
]
buf = StringIO.StringIO()
c.setopt(c.URL, "http://uploadir.com/file/upload")
c.setopt(c.HTTPPOST, values)
c.setopt(c.COOKIEFILE, cookie_file_name)
c.setopt(c.COOKIEJAR, cookie_file_name)
c.setopt(c.WRITEFUNCTION, buf.write)
if c.perform():
self.broadcasts.append(uploadfailed+" "+file+".")
c.close()
return
self.result = buf.getvalue()
#print self.result
c.close()
doc = urlparse.urlparse(self.result)
print doc
self.urls.append(doc.getElementsByTagName("download")[0].childNodes[0].nodeValue)
def setClipBoard(self):
c = gtk.Clipboard()
c.set_text('\n'.join(self.urls))
c.store()
if len(self.urls) == 1:
self.broadcasts.append(oneimage)
elif len(self.urls) != 0:
self.broadcasts.append(str(len(self.urls))+" "+multimages)
if __name__ == '__main__':
uploadir = Uploadir(sys.argv)
处理HTML解析的代码在这里:
doc = urlparse.urlparse(self.result)
self.urls.append(doc.getElementsByTagName("download")[0].childNodes[0].nodeValue)
1 个回答
1
urlparse
模块和解析HTML没有关系。它的作用就是把一个网址拆分成几个部分,比如协议、网络地址、路径等等。例如:
>>> urlparse.urlparse("http://www.stackoverflow.com/questions/4699888") ParseResult(scheme='http', netloc='www.stackoverflow.com', path='/questions/4699888', params='', query='', fragment='')
如果你想解析HTML,可以试试BeautifulSoup这个工具。