Python 抓取图片示例 (Python image-scraping example)
发布时间：2020-12-17 17:27:04 所属栏目：Python 来源：网络整理
µ¼¶Á£º½ñÌìPHPÕ¾³¤Íø 52php.cn°ÑÊÕ¼¯×Ô»¥ÁªÍøµÄ´úÂë·ÖÏí¸ø´ó¼Ò£¬½ö¹©²Î¿¼¡£ #!/usr/bin/python# -*- coding:utf-8 -*-import reimport osimport urllib,urllib2,cookielibimport shutilfrom BeautifulSoup import BeautifulSoup
|
ÒÔÏ´úÂëÓÉPHPÕ¾³¤Íø 52php.cnÊÕ¼¯×Ô»¥ÁªÍø ÏÖÔÚPHPÕ¾³¤ÍøÐ¡±à°ÑËü·ÖÏí¸ø´ó¼Ò£¬½ö¹©²Î¿¼ #!/usr/bin/python
# -*- coding:utf-8 -*-
import re
import os
import urllib,urllib2,cookielib
import shutil
from BeautifulSoup import BeautifulSoup
# ---- utils ----
def normalize_url(url):
    """Return *url* prefixed with "http://" unless it already starts with it.

    Only the literal "http://" scheme is recognised; an https URL gets a
    second prefix ("http://https://...") -- quirk kept from the original.
    """
    # str.startswith replaces the Python-2-only cmp() builtin comparison.
    return url if url.startswith("http://") else "http://" + url
def safeDir(dir):
    """Return *dir* with every '/' removed, making it usable as a single
    directory name."""
    # Splitting on '/' and re-joining drops all slashes.
    return ''.join(dir.split('/'))
# ---- variable ----
# Forum listing pages are homepagePrefix + page number + homepageSuffix.
homepagePrefix = "http://60dxw.comww1.baisex.me/forum-47-"
homepageSuffix = ".html"
# Thread links scraped from a listing page are relative; prepend this host.
threadPrefix = "http://60dxw.comww1.baisex.me/"
# Local root directory all downloads are stored under.
homedir = "baixingge"
# ---- login ----
# Cookie-carrying opener (urllib2/cookielib are Python 2 modules).
# NOTE(review): `opener` is never used below -- every download goes through
# urllib.urlopen -- so the cookie jar currently has no effect; kept so the
# module-level names stay unchanged.
cookie = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie)
# ---- file ----
# Create the download root if needed and work from inside it; every path
# used by the crawl below is relative to this directory.
if not os.path.exists(homedir):
    os.mkdir(homedir)
os.chdir(homedir)
# ---- crawl ----
for page in range(1,25):
pageUrl = '{0}{1}{2}'.format(homepagePrefix,page,homepageSuffix)
# ---- mkdir ----
if (os.path.exists(str(page)) == False):
os.mkdir(str(page))
os.chdir(str(page))
print pageUrl
# ---- download ----
html_body = urllib.urlopen(pageUrl).read()
soup = BeautifulSoup(html_body)
# ---- extract ----
threaddUrls = []
urlRaws = soup.findAll('th',attrs = {'class' : ['new','common']})
urlPattern = re.compile(r'href="([^"]*)"')
titlePattern = re.compile(r'>([^<]*)</a>')
for urlRaw in urlRaws:
h = urlPattern.search(str(urlRaw))
t = titlePattern.search(str(urlRaw))
threadUrl = h.group(1)
threadTitle = t.group(1)
if (os.path.exists(threadTitle) == False):
os.mkdir(safeDir(threadTitle))
else:
continue
os.chdir(safeDir(threadTitle))
page_url = threadPrefix + threadUrl
print "---->{0}".format(page_url)
print "---->{0}".format(safeDir(threadTitle))
page_body = urllib.urlopen(page_url).read()
page_soup = BeautifulSoup(page_body)
imgPattern = re.compile(r'img src="([^"]*)" onload')
i = imgPattern.findall(str(page_soup))
index = 0
for img in i:
print "-------->{0}".format(img)
imgSuffix = img[img.rindex('.'):]
imgName = "{0}{1}".format(str(index),imgSuffix)
urllib.urlretrieve(img,imgName,None)
index += 1
os.chdir("../")
os.chdir("../")
ÒÔÉÏÄÚÈÝÓÉPHPÕ¾³¤Íø¡¾52php.cn¡¿ÊÕ¼¯ÕûÀí¹©´ó¼Ò²Î¿¼Ñо¿ Èç¹ûÒÔÉÏÄÚÈݶÔÄúÓаïÖú£¬»¶ÓÊղء¢µãÔÞ¡¢ÍƼö¡¢·ÖÏí¡£ £¨±à¼£ºÀî´óͬ£© ¡¾ÉùÃ÷¡¿±¾Õ¾ÄÚÈݾùÀ´×ÔÍøÂ磬ÆäÏà¹ØÑÔÂÛ½ö´ú±í×÷Õ߸öÈ˹۵㣬²»´ú±í±¾Õ¾Á¢³¡¡£ÈôÎÞÒâÇÖ·¸µ½ÄúµÄȨÀû£¬Ç뼰ʱÓëÁªÏµÕ¾³¤É¾³ýÏà¹ØÄÚÈÝ! |
Ïà¹ØÄÚÈÝ
- Pythonµ¼ÈëoracleÊý¾ÝµÄ·½·¨
- Python Pandas²¢ÅÅ»æÖÆÁ½¸öBARH
- ÀûÓÃPythonÔÚ±¾µØ¿ª·¢NeoÖÇÄܺÏÔ¼£¡
- python ¨C ¾ßÓÐÕÅÁ¿Á÷µÄÓïÒå·Ö¶Î ¨C Ëðʧº¯ÊýÖеÄValueErr
- ÈçºÎÖÆ×÷ÈýÖØµÈ¼Û×ֵ䣿
- ¼ì²âÎļþÁ÷ÖÐÌØ¶¨×Ö·ûºó´¥·¢
- python½Ó¿Ú²âÊÔ4-Êý¾Ý¿â»ñÈ¡²ÎÊý
- ÈçºÎÔÚPandasÖеÄgroupsByÖб£ÁôûÓзÖ×éÁÐ
- python ¨C µ±ÈÎÎñ½á¹ûºÜ´óʱ,ÎÒÓ¦¸ÃÈçºÎʹÓÃCelery£¿
- PythonÖÐʹÓÃÖ§³ÖÏòÁ¿»úSVMʵ¼ù
ÍÆ¼öÎÄÕÂ
Õ¾³¤ÍƼö
- Pythonic·½Ê½£ºÀà»òÄ£¿éÖеÄʵÓú¯Êý
- pythonÖÆ×÷»¨°êÍøÃÀŮͼƬÅÀ³æ
- PythonÖÐÏ̱߳à³ÌÖ®threadingÄ£¿éµÄʹÓÃÏê½â
- python ¨C ÔÚmatplotlib¶¯»Ä£¿éÖйÜÀí¶¯Ì¬»æÍ¼
- Python¶à½ø³Ì·Ö¿é¶ÁÈ¡³¬´óÎļþµÄ·½·¨
- python´Óbash shellµ÷ÓÃ×Ô¶¨Ò庯Êý
- PythonÖм¸ÖÖ²Ù×÷×Ö·û´®µÄ·½·¨µÄ½éÉÜ
- 5 ÖÖʹÓà Python ´úÂëÇáËÉʵÏÖÊý¾Ý¿ÉÊÓ»¯µÄ·½·¨£¡
- PythonµÄÔªÀà(metaclass)µÄ¼òµ¥Ê¾Àý
- python¶ÁÎı¾ÖеĵÚÒ»ÁÐÄÚÈÝÉú³ÉBATÎļþ.py
ÈȵãÔĶÁ
