采集人物经历来佐证子平术
見《宋書·范曄傳》:“言之皆有實證,非為空談。”子平有較高的或然率,但如果沒有人物經歷來佐證,就變成三教九流,成為“玄學”,實在可惜。老外搞個MBTI性格測試就巴巴說是科學;有智慧的老前輩總結的經驗,卻因為後人不懂而無法傳承,散落在民間,只能偷偷摸摸地流傳、被人看不起,實在是看不過去。
有時候感覺西方人很笨,調研70多個家庭、跟蹤他們的一生,然後給出結論、發表論文。然後中國的學生就認為人家嚴謹、有科學研究精神,何曾想過這70個樣本相對人類這麼大的基數根本不值一提。換一個視角,研究問題真的需要採用這麼笨的方法嗎?梁湘潤大師等都說能看準一個人一生的70%就不錯了。是的,人的極限估計也差不多是這些,另外20~30%還需要靠國運、環境和個人修養。人之一生豈是幾百頁的子平能講完的呢?那豈不是白活。
我會一些計算機,另外對梁老的一些觀點特別認同:大部分人都是普通人,每天為了家計日常奔波,為了妻財子祿壽而焦慮;要想做人上人,你真的能承受他們所經歷的嗎?
這篇博客主要是從百度百科上採集人物經歷,後面再結合子平理論進行實證。
1 采集人物信息
# -*- coding: utf-8 -*-
# @time : 2022/1/22 11:03
# @author : dzm
# @dsec : Baidu Baike entertainment figures
import re

import scrapy
from pyquery import PyQuery as pq
from sqlalchemy.engine import create_engine

from personspider.items.baidu_person import (
    BaiduPersonExperienceItem,
    BaiduPersonItem,
    BaiduPersonRelationItem,
    BaiduUrlItem,
)
from personspider.service.baidu_service import BaiduUrlService
from personspider.settings import MYSQL_CONFIG
from personspider.utils import person_util, str_util

# A complete birth date: year, month and day (e.g. "1980年1月2日" or "1980-1-2").
_FULL_DATE_RE = re.compile(r'\d{4}[年\-]\d{1,2}[月\-]\d{1,2}日?')
# Paragraphs that open with a year are treated as career history ...
_YEAR_PREFIX_RE = re.compile(r'^\d{4}年')
# ... unless they open with a full date.
_FULL_DATE_PREFIX_RE = re.compile(r'^\d{4}年\d{1,2}月\d{1,2}日')

# Info-box labels that are copied verbatim into an item field.
# Baidu Baike serves simplified Chinese; the traditional spellings are accepted
# as well so pages run through a script converter still match.
_PLAIN_FIELDS = {
    '中文名': 'cn_name',
    '外文名': 'en_name',
    '性别': 'sex',
    '性別': 'sex',
    '国籍': 'nation',
    '國籍': 'nation',
    '出生地': 'birthplace',
    '逝世日期': 'deathday',
    '毕业院校': 'school',
    '畢業院校': 'school',
    '职业': 'occupation',
    '職業': 'occupation',
    '主要成就': 'achievements',
    '代表作品': 'representation',
}


class yulespider(scrapy.Spider):
    """Crawl Baidu Baike person pages and emit person/experience/relation/URL items."""

    name = 'baidu_yule'

    def start_requests(self):
        """Yield one request per URL returned by ``BaiduUrlService.get_urls()``."""
        # The URLs to crawl are read from the database.
        engine = create_engine(
            'mysql+pymysql://{}:{}@{}:3306/{}'.format(
                MYSQL_CONFIG['user'], MYSQL_CONFIG['password'],
                MYSQL_CONFIG['host'], MYSQL_CONFIG['db']),
            connect_args={'charset': 'utf8'},
            pool_size=MYSQL_CONFIG['pool_size'])
        urls = BaiduUrlService(engine).get_urls()
        for url in urls or ():
            yield scrapy.Request(url=url.url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Parse one person page.

        Yields, in order: the person item and its experience items (only when
        the page has a complete birth date), a URL item marking the current
        page with status '1', then for every related person a relation item
        (only for valid persons) and a URL item with status '0'.
        """
        cur_url = person_util.get_url(response.request.url)
        soup = pq(response.text)

        person_item = self.pack_person(self._basic_info(soup), r'百度百科', cur_url)
        # pack_person returns None for a page without an info box, and an
        # unset item field raises KeyError on subscription, so read via get().
        cur_name = person_item.get('cn_name') if person_item is not None else None
        birthday = person_item.get('birthday') if person_item is not None else None
        if birthday is not None:
            self.logger.info('中文名:{}, 出生日期:{},鏈接:{}'.format(cur_name, birthday, cur_url))
        else:
            self.logger.info('中文名:{}, 鏈接:{}'.format(cur_name, cur_url))

        # The birth date must be present and complete (year, month, day);
        # only then are the person and the career history collected.
        valid_person = bool(birthday and _FULL_DATE_RE.match(birthday))
        person_id = person_item['id'] if valid_person else None
        if valid_person:
            yield person_item
            for exp_item in self._experience_items(soup, person_id):
                yield exp_item

        # The link that has just been crawled.
        cur_url_item = BaiduUrlItem()
        cur_url_item['id'] = str_util.gen_md5(cur_url)
        cur_url_item['url'] = cur_url
        cur_url_item['status'] = '1'
        # Store the person's name, not whatever info-box label was parsed last.
        cur_url_item['name'] = cur_name
        yield cur_url_item

        # Related persons.
        for relation_el in soup('.relations li'):
            relation = pq(relation_el)
            href = relation('a').attr('href')
            if not href:
                # No link to follow for this entry.
                continue
            url = person_util.get_url('https://baike.baidu.com' + href)
            # The markup of a relation entry varies between pages.
            name = relation('.title').text()
            if name:
                tag = relation('.name').text()
            else:
                name = relation('.name').attr('title') or ''
                tag = relation('.name').text()
                # Here the '.name' text is "<relation><person name>"; drop the name.
                if name and tag.endswith(name):
                    tag = tag[:-len(name)]
            if valid_person:
                relation_item = BaiduPersonRelationItem()
                relation_item['one'] = person_id
                relation_item['one_name'] = cur_name
                relation_item['one_url'] = cur_url
                relation_item['two'] = str_util.gen_md5(url)
                relation_item['two_name'] = name
                relation_item['two_url'] = url
                relation_item['relation'] = tag
                yield relation_item
            # NOTE(review): the published listing lost its indentation; the
            # link item is assumed to be emitted for every relation so the
            # crawl keeps expanding even from pages without a valid person.
            url_item = BaiduUrlItem()
            url_item['id'] = str_util.gen_md5(url)
            url_item['url'] = url
            url_item['status'] = '0'
            url_item['name'] = name
            yield url_item

    def _basic_info(self, soup):
        """Return the info box as a ``{label: value}`` dict of cleaned strings."""
        info = {}
        for block_el in soup('.basic-info .basicInfo-block'):
            block = pq(block_el)
            # The i-th <dt> is the label of the i-th <dd>.
            for i in range(len(block('dt'))):
                label = str_util.clear(block('dt:eq({})'.format(i)).text())
                info[label] = str_util.clear(block('dd:eq({})'.format(i)).text())
        return info

    def _experience_items(self, soup, person_id):
        """Yield one experience item per year/month entry found in '.para' blocks."""
        for para in soup('.para'):
            content = pq(para).text()
            if not _YEAR_PREFIX_RE.match(content) or _FULL_DATE_PREFIX_RE.match(content):
                continue
            for experience in person_util.get_experience(content) or ():
                if not experience or not experience['experience']:
                    continue
                exp_item = BaiduPersonExperienceItem()
                exp_item['id'] = str_util.gen_md5(experience)
                exp_item['person_id'] = person_id
                exp_item['year'] = experience['year']
                if 'month' in experience:
                    exp_item['month'] = experience['month']
                exp_item['experience'] = experience['experience']
                yield exp_item

    def pack_person(self, content, source, url):
        """Build a ``BaiduPersonItem`` from the info-box dict.

        :param content: ``{label: value}`` dict taken from the info box
        :param source: name of the data source stored on the item
        :param url: canonical page URL; its MD5 becomes the item id
        :return: the populated item, or ``None`` when ``content`` is empty
        """
        if not content:
            return None
        item = BaiduPersonItem()
        item['source'] = source
        item['url'] = url
        item['id'] = str_util.gen_md5(url)
        for key, value in content.items():
            field = _PLAIN_FIELDS.get(key)
            if field is not None:
                item[field] = value
            elif key == '出生日期':
                # Keep the birthday only when it carries year, month and day.
                birthday = _FULL_DATE_RE.search(value)
                if birthday:
                    item['birthday'] = birthday.group(0)
            elif key == '身高':
                item['height'] = person_util.get_height(value)
        return item


if __name__ == '__main__':
    pass

# ---- Article section 2: extracting content ----
# ---- person_util (presumably personspider/utils/person_util.py) ----
import re

from personspider.utils import str_util

_YEAR_RE = re.compile(r'(\d{4}年)')
_MONTH_RE = re.compile(r'(\d{1,2}月)')
# Leading/trailing separators removed from each experience: ASCII and fullwidth comma.
_EDGE_COMMAS = ',,'


def get_url(url):
    """Return ``url`` without its query string (everything from the first '?')."""
    return url.split('?', 1)[0]


def get_experience(text):
    """Split a career-history paragraph into per-year / per-month records.

    The text is cut at every ``NNNN年`` marker. When the text that follows a
    year contains ``N月`` markers it is cut again, giving one record per month;
    anything between the year and its first month marker is not emitted.

    :param text: paragraph text
    :return: list of dicts with keys ``year``, ``experience`` and, for
        month-level records, ``month``
    """
    # NOTE: no flags are passed to split(). The third positional argument of
    # re.split is ``maxsplit``, so passing re.S there capped the splits at 16.
    year_parts = _YEAR_RE.split(text)
    records = []
    # split() with a capture group returns [prefix, year1, body1, year2, body2, ...].
    for year, body in zip(year_parts[1::2], year_parts[2::2]):
        body = str_util.clear(body.strip(_EDGE_COMMAS))
        month_parts = _MONTH_RE.split(body)
        if len(month_parts) == 1:
            # No month marker: a single year-level record.
            records.append({'year': year, 'experience': body})
            continue
        for month, detail in zip(month_parts[1::2], month_parts[2::2]):
            detail = str_util.clear(detail.strip(_EDGE_COMMAS))
            records.append({'year': year, 'month': month, 'experience': detail})
    return records


def get_height(height):
    """Return the first 1-4 digit number (with an optional ``cm``) in ``height``, or ``None``."""
    found = re.search(r'\d{1,4}(cm)?', height)
    return found.group(0) if found else None


# ---- str_util (presumably personspider/utils/str_util.py) ----
import hashlib
import re

# A '[' followed by digits, '-' or ']' (e.g. "[1]", "[2-3]").
_CITATION_RE = re.compile(r'\[[\d\-\]]+')


def gen_md5(item):
    """Return the hex MD5 digest of ``str(item)`` encoded as UTF-8."""
    return hashlib.md5(str(item).encode('utf-8')).hexdigest()


def remove_xa0(value):
    """Remove non-breaking spaces (``\\xa0``) from ``value``."""
    return value.replace(u'\xa0', u'')


def remove_quote(value):
    """Remove bracketed citation markers such as ``[1]`` or ``[2-3]`` from ``value``."""
    return _CITATION_RE.sub("", value)


def remove_blank(value):
    """Remove ASCII spaces from ``value``."""
    return value.replace(' ', '')


def clear(value):
    """Strip non-breaking spaces, citation markers and spaces from ``value``."""
    return remove_blank(remove_quote(remove_xa0(value)))

# ---- Article section 3: data pipeline ----
from sqlalchemy.engine import create_engine

from personspider.items.baidu_person import (
    BaiduPersonExperienceItem,
    BaiduPersonItem,
    BaiduPersonRelationItem,
    BaiduUrlItem,
)
from personspider.service.baidu_service import (
    BaiduPerson,
    BaiduPersonExperience,
    BaiduPersonExperienceService,
    BaiduPersonRelation,
    BaiduPersonRelationService,
    BaiduPersonService,
    BaiduUrl,
    BaiduUrlService,
)


class MysqlPipeline(object):
    """Item pipeline that stores each scraped item through its matching service."""

    def __init__(self, engine):
        """Create one service per item type, all sharing ``engine``."""
        self.baiduPersonService = BaiduPersonService(engine)
        self.baiduPersonExperienceService = BaiduPersonExperienceService(engine)
        self.baiduUrlService = BaiduUrlService(engine)
        self.baiduPersonRelationService = BaiduPersonRelationService(engine)
        # Exact item type -> (record class, service that inserts it).
        self._routes = {
            BaiduPersonItem: (BaiduPerson, self.baiduPersonService),
            BaiduPersonExperienceItem: (BaiduPersonExperience,
                                        self.baiduPersonExperienceService),
            BaiduUrlItem: (BaiduUrl, self.baiduUrlService),
            BaiduPersonRelationItem: (BaiduPersonRelation,
                                      self.baiduPersonRelationService),
        }

    def process_item(self, item, spider):
        """Insert ``item`` through the service registered for its type.

        Items of an unregistered type are passed through untouched.

        :return: ``item``, so that later pipelines still receive it
        """
        route = self._routes.get(type(item))
        if route is not None:
            record_cls, service = route
            service.insert(record_cls(**item))
        return item

    @classmethod
    def from_settings(cls, settings):
        """Build the pipeline from the ``MYSQL_CONFIG`` entry of the settings."""
        mysql_config = settings.get('MYSQL_CONFIG')
        engine = create_engine(
            'mysql+pymysql://{}:{}@{}:3306/{}'.format(
                mysql_config['user'], mysql_config['password'],
                mysql_config['host'], mysql_config['db']),
            connect_args={'charset': 'utf8'},
            pool_size=mysql_config['pool_size'])
        return cls(engine)

# ---- Article section 4: writing to the database ----
# NOTE(review): this listing is an excerpt; sessionmaker, datetime, EmailService
# and BaiduPerson are expected to be imported or defined elsewhere in the module.
class BaiduPersonService(object):
    """Persist ``BaiduPerson`` records and e-mail a report when a write fails."""

    def __init__(self, engine):
        """Open a session bound to ``engine`` and create the e-mail notifier."""
        self.engine = engine
        Session = sessionmaker(engine)
        self.session = Session()
        self.emailService = EmailService()

    def exist(self, id):
        """Return True when a person row with this ``id`` is already stored."""
        query = self.session.query(BaiduPerson).filter(BaiduPerson.id == id)
        return query.count() > 0

    def insert(self, record):
        """Insert ``record`` unless a row with the same id already exists.

        A failed write is rolled back and reported by e-mail instead of being
        raised, so one bad record does not stop the crawl.
        """
        if self.exist(record.id):
            return
        try:
            record.create_time = datetime.datetime.now()
            self.session.add(record)
            self.session.commit()
        except Exception as e:
            # Without a rollback the session stays in a failed state and every
            # later insert through this service would fail as well.
            self.session.rollback()
            title = r'{}寫入數據庫失敗'.format(record.cn_name)
            content = r'ERROR {}'.format(str(e))
            self.emailService.sendEmail(title, content)

# ---- Article section 5: sending e-mail on errors ----
如果在解析過程中出現異常,總不需要時刻盯著吧?寫個發郵件的功能來告知我,豈不是很安逸。
總結
以上是生活随笔為你收集整理的《采集人物经历来佐证子平术》的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 以计算机之眼观照生活 以人工智能之慧理解
- 下一篇: 【Java 后端接收前端的富文本数据,其