<!-- #!/usr/bin/python # -*- coding: big5 -*-
import httplib,urllib,sys,traceback
def findSubBetween(body,startstr,endstr,start = 0):
    """Return the text of *body* lying between *startstr* and *endstr*.

    The search for *startstr* begins at index *start*; *endstr* is searched
    for after the end of the matched *startstr*.  The delimiters themselves
    are not included in the result.

    Returns an empty string when either delimiter cannot be found, instead of
    slicing with the -1 that ``find`` returns (which yields unrelated text).
    """
    s = body.find(startstr, start)
    if s == -1:
        return ''
    begin = s + len(startstr)
    e = body.find(endstr, begin)
    if e == -1:
        return ''
    return str(body[begin:e])
class Detail(object):
    """Plain attribute holder for one thesis record.

    Instances start empty.  Elsewhere in this file the attributes ``author``,
    ``year``, ``subject``, ``pages``, ``advisor`` and ``urn`` are assigned by
    the site adapters and read back when the CSV is written.
    """
class ETDBean(object):
    """Base class for a site-specific ETD search adapter.

    Every method except ``setDownLoadETD`` is a do-nothing stub that returns
    ``None``; the concrete adapters in this file override them.

    NOTE(review): in the listing this class was recovered from, only
    ``setDownLoadETD`` kept its header; nine bare ``pass`` lines followed it.
    The nine stub names and parameters below are taken from the methods the
    adapters define and the driver calls -- confirm against a pristine copy.
    """

    def setDownLoadETD(self,detd):
        """Store *detd*, the driver object adapters use to issue requests."""
        self.detd = detd

    def gettitle(self):
        """Return a label for this query (used in the output file name)."""
        pass

    def getweb(self):
        """Return the host name of the ETD site."""
        pass

    def getlistparams(self):
        """Return the form parameters of the list (search) request."""
        pass

    def haslist(self,rs):
        """Return whether the result page *rs* contains any records."""
        pass

    def findlistcounts(self,rs):
        """Return the total record count reported on result page *rs*."""
        pass

    def hasnextlistrs(self):
        """Fetch the next result page; return whether one was available."""
        pass

    def nextlistrs(self):
        """Return the result page fetched by the last ``hasnextlistrs``."""
        pass

    def findurnlist(self,rs):
        """Return the thesis URNs found on result page *rs*."""
        pass

    def getdetailbean(self,urn):
        """Return the detail record for thesis *urn*."""
        pass
class DownLoadETD(object):
    """Drive one site adapter: collect thesis URNs, load details, write a CSV.

    NOTE(review): reconstructed from a whitespace-mangled listing in which
    every method header except the constructor was missing, as were the
    ``return`` lines at the end of ``getqueryurnlist`` and ``loaddeatail``.
    Method names and parameters come from the call sites in this file and
    from the names each body uses; confirm against a pristine copy.
    """

    def __init__(self,etdbean):
        """Bind *etdbean* to this driver and create a connection to its host.

        The listing showed this method as ``init``; the script constructs
        ``DownLoadETD(bean)`` with an argument, which needs ``__init__``.
        """
        self.etdbean = etdbean
        # Give the adapter a back-reference so it can call getresponse().
        self.etdbean.setDownLoadETD(self)
        self.headers = {"User-Agent":"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)"}
        self.conn = httplib.HTTPConnection(self.etdbean.getweb())

    def getresponse(self,body,params):
        """POST *params* to the path *body* and return the response content.

        Despite its name, *body* is passed as the request URL; the
        url-encoded *params* form the request body.
        """
        self.conn.request("POST",body,urllib.urlencode(params),self.headers)
        return self.conn.getresponse().read()

    def getqueryurnlist(self):
        """Return the URNs from every result page the adapter yields."""
        urnlist = []
        while self.etdbean.hasnextlistrs():
            rs = self.etdbean.nextlistrs()
            urnlist.extend(self.etdbean.findurnlist(rs))
        # NOTE(review): the listing repeated the two loop-header lines once
        # more with no body; treated as a rendering artifact.
        return urnlist

    def loaddeatail(self,urnlist):
        """Return the adapter's detail record for each URN in *urnlist*.

        Progress is written to stderr, ending in a carriage return so each
        update overwrites the previous one on a terminal.
        """
        total = len(urnlist)
        deataillist = []
        for urn in urnlist:
            deataillist.append(self.etdbean.getdetailbean(urn))
            sys.stderr.write("load detail %d/%d done \r"%(len(deataillist),total) )
        return deataillist

    def exportCSV(self,deataillist):
        """Write *deataillist* to ``<title>-歷屆論文.csv`` in the working directory.

        Does nothing when *deataillist* is empty.  Each record must provide
        ``year``, ``subject``, ``pages``, ``author``, ``advisor`` and ``urn``.
        Rows go through ``csv.writer`` so a field containing a comma or a
        quote no longer shifts the columns, and the file is closed even if a
        row fails to serialize.
        """
        import csv  # local import: not provided by the file's import line

        if not deataillist:
            return
        fname = '%s-歷屆論文.csv'%self.etdbean.gettitle()
        web = self.etdbean.getweb()
        with open(fname,'w') as outf:
            outf.write('年度,論文,頁數,作者,指導教授,論文識別碼,網址\n')
            # lineterminator='\n' keeps the row ending the original wrote.
            writer = csv.writer(outf, lineterminator='\n')
            for d in deataillist:
                writer.writerow([
                    str(d.year), d.subject, str(d.pages), d.author, d.advisor, d.urn,
                    'http://%s/ETD-db/ETD-search-c/view_etd?URN=%s'%(web,d.urn),
                ])

    def download(self):
        """Run the whole job for the bound adapter.

        1. collect the URN list   2. load the detail of each URN
        3. export the details as CSV
        """
        print(self.etdbean.gettitle())
        urnlist = self.getqueryurnlist()
        # Single pre-formatted string: same text the Python 2 statement
        # ``print len(urnlist),'\n','\n'.join(urnlist)`` produced.
        print("%d \n%s" % (len(urnlist), '\n'.join(urnlist)))
        deataillist = self.loaddeatail(urnlist)
        self.exportCSV(deataillist)
class MCUETDBean(ETDBean):
    """Adapter for the ETD site at ethesys.lib.mcu.edu.tw.

    NOTE(review): reconstructed from a whitespace-mangled listing.  Method
    headers after the constructor were missing (names restored from call
    sites in this file), empty-string literals had been dropped, and the
    ``return`` of ``findurnlist`` was lost.  Confirm against a pristine copy.
    """

    def __init__(self,year1 = 1 ,year2 = 999 , department = '資訊管理學系碩士在職專班'):
        """Set the year range and department to query and reset paging.

        The listing showed this method as ``init``.
        """
        self.year1 = year1
        self.year2 = year2
        self.department = department
        self.place = 0       # value sent as 'place' with the next list request
        self.maxplace = 0    # total record count; 0 until the first page is read
        self.nextrs = None   # most recently fetched result page

    def gettitle(self):
        """Return the label ``MCU-<department>-<year1>-<year2>``."""
        return "MCU-%s-%s-%s" %(self.department,self.year1,self.year2)

    def getweb(self):
        """Return the host name of the site."""
        return "ethesys.lib.mcu.edu.tw"

    def getlistparams(self):
        """Return a fresh dict of form fields for the search request."""
        return {'field6': 'year', 'queryy6': self.year1 , 'query6': self.year2 ,'boolean6':'AND'
                ,'field7': 'department_c', 'query7': self.department, 'boolean7':'AND'
                ,'num_terms':'9','place':0,'field1':'name_c','query1':''}

    def haslist(self,rs):
        """Return True unless *rs* contains the site's "no records" text."""
        return rs.find("沒有記錄") == -1

    def findlistcounts(self,rs):
        """Return the total hit count shown on result page *rs* as an int."""
        return int(findSubBetween(rs,'檢索結果共<font color="red"><b>','</b>').strip())

    def hasnextlistrs(self):
        """Fetch the next result page; return True when one was stored.

        Returns False once ``place`` has passed the total count or when the
        fetched page reports no records.
        """
        if self.place > self.maxplace:
            return False
        params = self.getlistparams()
        params['place'] = self.place
        rs = self.detd.getresponse("/ETD-db/ETD-search-c/search",params)
        if self.haslist(rs) == False:
            return False
        if not self.maxplace:
            # Only the first page is used to learn the total.
            self.maxplace = self.findlistcounts(rs)
        self.nextrs = rs
        self.place = self.place + 12
        return True

    def nextlistrs(self):
        """Return the page stored by the last successful ``hasnextlistrs``."""
        return self.nextrs

    def findurnlist(self,rs):
        """Return every URN linked as ``view_etd?URN=...`` on page *rs*."""
        urnlist = []
        ssrt = '<a href="view_etd?URN='
        start = rs.find(ssrt)
        while start > -1:
            urnlist.append(findSubBetween(rs,ssrt,'">',start).strip())
            start = rs.find(ssrt,start+len(ssrt))
        return urnlist

    def getdetailbean(self,urn):
        """Fetch the detail page of *urn* and return it parsed into a Detail."""
        d = Detail()
        # Newlines are removed so the table-cell markers below match even
        # when the page breaks lines between tags.
        rs = self.detd.getresponse("/ETD-db/ETD-search-c/view_etd",{'URN':urn}).replace('\n','')
        d.author = findSubBetween(rs,'<tr><td align="left" valign="top">中文姓名</td><td align="left" valign="top">','</td>')
        d.year = findSubBetween(rs,'<tr><td align="left" valign="top">學年度</td><td align="left" valign="top">','</td>')
        d.subject = findSubBetween(rs,'<tr><td align="left" valign="top">論文名稱(中)</td><td align="left" valign="top">','</td>')
        d.pages = findSubBetween(rs,'<tr><td align="left" valign="top">頁數</td><td align="left" valign="top">','</td>')
        # Keep the last <li> entry before the "- 指導教授" marker and drop
        # the "教授" title from it.
        d.advisor = findSubBetween(rs,'<tr><td align="left" valign="top">口試委員</td><td align="left" valign="top">','- 指導教授').split('<li>')[-1].strip(' \n').replace('教授','')
        d.urn = urn
        return d
class SCUETDBean(ETDBean):
    """Adapter for the ETD site at etd.library.scu.edu.tw.

    NOTE(review): reconstructed from a whitespace-mangled listing.  Method
    headers after the constructor were missing (names restored from call
    sites in this file) and empty-string literals had been dropped.  The end
    of ``hasnextlistrs`` and parts of ``getdetailbean`` were lost outright;
    the spots filled in by inference are marked below.  Confirm against a
    pristine copy.
    """

    def __init__(self,year1 = 1 ,year2 = 999 , department = '法律學系'):
        """Set the year range and department to query and reset paging.

        The listing showed this method as ``init``.
        """
        self.year1 = year1
        self.year2 = year2
        self.department = department
        self.pg = 1                        # page requested next
        self.maxpg = 1                     # page count; recomputed on page 1
        self.pgrecordlimit = 999999999999  # sent as 'sep_num' with each query
        self.nextrs = None                 # most recently fetched result page

    def gettitle(self):
        """Return the label ``SCU-<department>-<year1>-<year2>``."""
        return "SCU-%s-%s-%s" %(self.department,self.year1,self.year2)

    def getweb(self):
        """Return the host name of the site."""
        return "etd.library.scu.edu.tw"

    def getlistparams(self):
        """Return a fresh dict of form fields for the search request."""
        return {'field3': 'year', 'query3': self.year2, 'queryy3': self.year1,'boolean3':'AND'
                ,'field2': 'department_c', 'query2': self.department, 'boolean2':'AND'
                ,'num_terms':'6','sep_num':self.pgrecordlimit,'field1':'name_c','query1':''}

    def haslist(self,rs):
        """Return True unless *rs* contains the site's "no data" text."""
        return rs.find("查無任何資料") == -1

    def findlistcounts(self,rs):
        """Return the total hit count shown on result page *rs* as an int."""
        return int(findSubBetween(rs,'共 <b><font color=red>','</font></b> 筆資料').strip())

    def hasnextlistrs(self):
        """Fetch the next result page; return True when one was stored."""
        if self.pg > self.maxpg:
            return False
        rs = None
        if self.pg == 1:
            rs = self.detd.getresponse("/ETD-db/ETD-search-c/search",self.getlistparams())
            if self.haslist(rs) == False:
                return False
            sumrecords = self.findlistcounts(rs)
            # Ceiling division: one extra page for any remainder.
            self.maxpg = (sumrecords // self.pgrecordlimit ) + min(1,sumrecords % self.pgrecordlimit)
        else:
            # NOTE(review): the 'else:' line itself was missing from the
            # listing; the three statements under it were present.
            params = self.getlistparams()
            params['pg']=self.pg
            rs = self.detd.getresponse("/ETD-db/ETD-search-c/search",params)
        # NOTE(review): the three lines below were missing from the listing
        # and are rebuilt by analogy with the MCU adapter in this file --
        # verify, in particular the page increment.
        self.nextrs = rs
        self.pg = self.pg + 1
        return True

    def nextlistrs(self):
        """Return the page stored by the last successful ``hasnextlistrs``."""
        return self.nextrs

    def findurnlist(self,rs):
        """Return the value of every ``flag`` checkbox on page *rs*."""
        urnlist = []
        ssrt = '<input type="checkbox" name="flag" value="'
        start = rs.find(ssrt)
        while start > -1:
            urnlist.append(findSubBetween(rs,ssrt,'">',start).strip())
            start = rs.find(ssrt,start+len(ssrt))
        return urnlist

    def getdetailbean(self,urn):
        """Fetch the detail page of *urn* and return it parsed into a Detail."""
        d = Detail()
        rs = self.detd.getresponse("/ETD-db/ETD-search-c/view_etd",{'URN':urn})
        d.author = findSubBetween(rs,'<td class="data_col_a">姓名</td><td class="data_col_b data_col_bgw">','(')
        d.year = findSubBetween(rs,'學期</td><td class="data_col_bgw">','學年度第')
        # NOTE(review): the listing showed a raw line break inside this
        # replace() literal, i.e. a tag that had been rendered; '<br>' is
        # assumed -- confirm the exact tag.
        d.subject = findSubBetween(rs,'<td class="data_col_a">論文名稱</td><td colspan="3" class="data_col_bgw">','</td>').replace('<br>','')
        # NOTE(review): the listing was cut off after "replace('指導教授',";
        # an empty replacement is assumed and anything chained after it is
        # unknown.
        d.advisor = findSubBetween(rs,'依職稱與姓名排序</font> <li>','- 指導教授').replace('指導教授','')
        # NOTE(review): no 'pages' assignment survived for this adapter, yet
        # the CSV export reads d.pages; default to empty so export works.
        d.pages = ''
        d.urn = urn
        return d
# Script driver: run the download for each configured site adapter, print a
# traceback if anything fails, then wait for Enter so a console window opened
# by double-clicking the script does not close before it can be read.
try:
    # The original kept a disabled variant here (inside a string literal)
    # that ran DownLoadETD(MCUETDBean()) alone.
    for detd in [ DownLoadETD(MCUETDBean()) , DownLoadETD(SCUETDBean()) ]:
        detd.download()
except Exception:
    # Top-level boundary: report the failure instead of dying silently.
    # Narrowed from a bare 'except:' so Ctrl-C / SystemExit still terminate.
    traceback.print_exc()
# NOTE(review): indentation was lost in the listing; the prompt is assumed to
# run after both success and failure.
PressKey = raw_input("\n\n\nPress Any key to exit...")
//-->