python抓取图片示例
发布时间：2020-12-17 17:27:04 所属栏目：Python 来源：网络整理
导读：今天PHP站长网 52php.cn把收集自互联网的代码分享给大家，仅供参考。 #!/usr/bin/python# -*- coding:utf-8 -*-import reimport osimport urllib,urllib2,cookielibimport shutilfrom BeautifulSoup import BeautifulSoup
ÒÔÏ´úÂëÓÉPHPÕ¾³¤Íø 52php.cnÊÕ¼¯×Ô»¥ÁªÍø ÏÖÔÚPHPÕ¾³¤ÍøÐ¡±à°ÑËü·ÖÏí¸ø´ó¼Ò£¬½ö¹©²Î¿¼ #!/usr/bin/python # -*- coding:utf-8 -*- import re import os import urllib,urllib2,cookielib import shutil from BeautifulSoup import BeautifulSoup # ---- utils ---- def normalize_url(url): return "http://" + url if cmp(url[0:7],"http://") != 0 else url def safeDir(dir): return dir.replace('/','') # ---- variable ---- homepagePrefix = "http://60dxw.comww1.baisex.me/forum-47-" homepageSuffix = ".html" threadPrefix = "http://60dxw.comww1.baisex.me/" homedir = "baixingge" # ---- login ---- cookie = urllib2.HTTPCookieProcessor(cookielib.CookieJar()) opener = urllib2.build_opener(cookie) # ---- file ---- if (os.path.exists(homedir) == False): os.mkdir(homedir) os.chdir(homedir) # ---- crawl ---- for page in range(1,25): pageUrl = '{0}{1}{2}'.format(homepagePrefix,page,homepageSuffix) # ---- mkdir ---- if (os.path.exists(str(page)) == False): os.mkdir(str(page)) os.chdir(str(page)) print pageUrl # ---- download ---- html_body = urllib.urlopen(pageUrl).read() soup = BeautifulSoup(html_body) # ---- extract ---- threaddUrls = [] urlRaws = soup.findAll('th',attrs = {'class' : ['new','common']}) urlPattern = re.compile(r'href="([^"]*)"') titlePattern = re.compile(r'>([^<]*)</a>') for urlRaw in urlRaws: h = urlPattern.search(str(urlRaw)) t = titlePattern.search(str(urlRaw)) threadUrl = h.group(1) threadTitle = t.group(1) if (os.path.exists(threadTitle) == False): os.mkdir(safeDir(threadTitle)) else: continue os.chdir(safeDir(threadTitle)) page_url = threadPrefix + threadUrl print "---->{0}".format(page_url) print "---->{0}".format(safeDir(threadTitle)) page_body = urllib.urlopen(page_url).read() page_soup = BeautifulSoup(page_body) imgPattern = re.compile(r'img src="([^"]*)" onload') i = imgPattern.findall(str(page_soup)) index = 0 for img in i: print "-------->{0}".format(img) imgSuffix = img[img.rindex('.'):] imgName = "{0}{1}".format(str(index),imgSuffix) urllib.urlretrieve(img,imgName,None) 
index += 1 os.chdir("../") os.chdir("../") ÒÔÉÏÄÚÈÝÓÉPHPÕ¾³¤Íø¡¾52php.cn¡¿ÊÕ¼¯ÕûÀí¹©´ó¼Ò²Î¿¼Ñо¿ Èç¹ûÒÔÉÏÄÚÈݶÔÄúÓаïÖú£¬»¶ÓÊղء¢µãÔÞ¡¢ÍƼö¡¢·ÖÏí¡£ £¨±à¼£ºÀî´óͬ£© ¡¾ÉùÃ÷¡¿±¾Õ¾ÄÚÈݾùÀ´×ÔÍøÂ磬ÆäÏà¹ØÑÔÂÛ½ö´ú±í×÷Õ߸öÈ˹۵㣬²»´ú±í±¾Õ¾Á¢³¡¡£ÈôÎÞÒâÇÖ·¸µ½ÄúµÄȨÀû£¬Ç뼰ʱÓëÁªÏµÕ¾³¤É¾³ýÏà¹ØÄÚÈÝ! |
相关内容
- Python导入oracle数据的方法
- Python Pandas并排绘制两个BARH
- 利用Python在本地开发Neo智能合约！
- python – 具有张量流的语义分段 – 损失函数中的ValueErr
- 如何制作三重等价字典？
- 检测文件流中特定字符后触发
- python接口测试4-数据库获取参数
- 如何在Pandas中的groupsBy中保留没有分组列
- python – 当任务结果很大时,我应该如何使用Celery？
- Python中使用支持向量机SVM实践
推荐文章
站长推荐
- Pythonic方式：类或模块中的实用函数
- python制作花瓣网美女图片爬虫
- Python中线程编程之threading模块的使用详解
- python – 在matplotlib动画模块中管理动态绘图
- Python多进程分块读取超大文件的方法
- python从bash shell调用自定义函数
- Python中几种操作字符串的方法的介绍
- 5 种使用 Python 代码轻松实现数据可视化的方法！
- Python的元类(metaclass)的简单示例
- python读文本中的第一列内容生成BAT文件.py
热点阅读