背景

最近在开发一款起名字的应用，最开始使用的康熙字典数据库是从github找的，使用了一段时间发现数据库中的汉字不全，部分高频汉字无法查询，最后决定自己去爬数据。

第一次爬取的是百度汉语，但有一个问题，名字的三才五格计算必须使用康熙汉字的笔画来计算，而百度汉语并没有给出汉字的康熙笔画，最终使用了汉程网的字典。

汉语字典

汉程网有汉语字典（新华字典）和康熙字典两种，这里使用了汉语字典，因为康熙字典并不是很全。比如我有一个同事的名字中有“清”，但是康熙字典中查不到该字，这时候就需要使用汉语字典中的汉字了。好在汉语字典中也给出了对应的康熙字典笔画。

康熙字典

汉语字典

数据爬取

数据库

-- One row per crawled Chinese character; filled by the crawler script's
-- `query` function from the dictionary result page.
CREATE TABLE `hancheng` (
  -- The character that was looked up (primary key).
  `word` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NOT NULL,
  -- Pinyin reading parsed from the page.
  `pinyin` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
  -- Traditional form of the character.
  `ft_word` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
  -- Character form listed under the page's Kangxi dictionary stroke entry.
  `kx_word` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
  -- Kangxi dictionary stroke count for `kx_word`.
  `kx_stroke` int DEFAULT NULL,
  -- Radical of the simplified form.
  `jt_bushou` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
  -- Stroke count of the simplified radical.
  `jt_bushou_stroke` int DEFAULT NULL,
  -- Total stroke count of the simplified form.
  `jt_stroke` int DEFAULT NULL,
  -- Radical of the traditional form.
  `ft_bushou` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
  -- Stroke count of the traditional radical.
  `ft_bushou_stroke` int DEFAULT NULL,
  -- Total stroke count of the traditional form.
  `ft_stroke` int DEFAULT NULL,
  -- Five-elements (wuxing) attribute; the crawler stores a single character.
  `wuxing` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
  -- Auspicious/inauspicious marker; the crawler stores a single character.
  `jixiong` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
  -- Whether the character is in common use; the crawler stores a single character.
  `is_changyong` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci DEFAULT NULL,
  PRIMARY KEY (`word`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_general_ci;

脚本

Main.py

import time
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import re

from pymysql import DatabaseError

from Config import config
import pymysql


# '
# 『飞』 　繁体字：飛　异体字：亴蜚
# 拼音：fēi　注音：ㄈㄟ
# 简体部首：飞　部首笔画：3　总笔画：3繁体部首：飛　部首笔画：9　总笔画：9康熙字典笔画( 飛:9； )
# '◎ 民俗参考汉字五行：水　吉凶寓意：吉　是否为常用字：是姓名学：姓,多用男性'
# '

def getConnection():
    """Open a PyMySQL connection described by the ``[MYSQL]`` config section.

    The HOST, PORT, DATA_BASE_NAME, USERNAME and PASSWORD options are read
    from the shared ``config`` object; PORT is converted to ``int``.

    Returns:
        The connection object returned by ``pymysql.connect``.
    """
    mysql_cfg = config["MYSQL"]
    return pymysql.connect(
        host=mysql_cfg["HOST"],
        port=int(mysql_cfg["PORT"]),
        db=mysql_cfg["DATA_BASE_NAME"],
        user=mysql_cfg["USERNAME"],
        password=mysql_cfg["PASSWORD"],
    )


def getAllCharacters(file_path='./data/hanzi.yaml'):
    """Load the set of characters to crawl from a one-entry-per-line file.

    Args:
        file_path: Path of the UTF-8 text file to read. Defaults to
            ``./data/hanzi.yaml``, the location the script has always used.

    Returns:
        A set containing every non-blank line with surrounding whitespace
        (including the line terminator) removed.
    """
    # ``with`` guarantees the handle is closed, even if reading fails.
    with open(file_path, 'r', encoding="utf-8") as file:
        # Stripping instead of requiring a '\n' keeps the final line when the
        # file has no trailing newline; blank lines are skipped so an empty
        # string is never queued for lookup.
        return {word for word in (line.strip() for line in file) if word}


def _first_match(pattern, text):
    """Return the first ``re.findall`` result for *pattern* in *text*, or None."""
    found = re.findall(pattern, text)
    return found[0] if found else None


def _parse_hancheng(one, two):
    """Extract the dictionary fields from the two text fragments of a page.

    Args:
        one: Text of the ``p.text15`` element (pinyin, traditional form,
            radicals, stroke counts and the Kangxi stroke entry).
        two: Text of the ``div.text16`` element (five elements, fortune and
            common-use flag).

    Returns:
        A dict keyed by the ``hancheng`` column names (excluding ``word``).
        Text fields that are not found stay ``None``; stroke counts that are
        not found stay ``0``.

    Raises:
        ValueError: If a captured stroke count is not an integer.
    """
    fields = {
        'pinyin': None,
        'ft_word': None,
        'kx_word': None,
        'kx_stroke': 0,
        'jt_bushou': None,
        'jt_bushou_stroke': 0,
        'jt_stroke': 0,
        'ft_bushou': None,
        'ft_bushou_stroke': 0,
        'ft_stroke': 0,
        'wuxing': None,
        'jixiong': None,
        'is_changyong': None,
    }

    # Some pages append a script call right after the reading, e.g.
    # "拼音：yùSetduyin('Duyin/yu4')", so the pinyin ends at "Setduyin"
    # instead of at whitespace.
    if 'Setduyin' in one:
        pinyin_pattern = r'拼音：(.+?)Setduyin'
    else:
        pinyin_pattern = r'拼音：(.+?)\s'
    pinyin = _first_match(pinyin_pattern, one)
    if pinyin is not None:
        fields['pinyin'] = pinyin
        print("拼音为:" + pinyin)

    ft_word = _first_match(r'繁体字：(.+?)\s', one)
    if ft_word is not None:
        fields['ft_word'] = ft_word
        print("繁体字 :" + ft_word)

    # Fall back to the plain "部首：" label when the page has no separate
    # simplified-radical entry.
    jt_bushou = _first_match(r'简体部首：(.+?)\s', one)
    if jt_bushou is None:
        jt_bushou = _first_match(r'部首：(.+?)\s', one)
    if jt_bushou is not None:
        fields['jt_bushou'] = jt_bushou
        print("简体部首 :" + jt_bushou)

    ft_bushou = _first_match(r'繁体部首：(.+?)\s', one)
    if ft_bushou is not None:
        fields['ft_bushou'] = ft_bushou
        print("繁体部首 :" + ft_bushou)

    # One hit means only the simplified radical is listed; two hits are the
    # simplified radical followed by the traditional one.
    bushou_strokes = re.findall(r'部首笔画：(.+?)\s', one)
    if len(bushou_strokes) == 1:
        fields['jt_bushou_stroke'] = int(bushou_strokes[0])
        print("简体部首笔画 :" + bushou_strokes[0])
    elif len(bushou_strokes) == 2:
        fields['jt_bushou_stroke'] = int(bushou_strokes[0])
        fields['ft_bushou_stroke'] = int(bushou_strokes[1])
        print("简体部首笔画 :" + bushou_strokes[0])
        print("繁体部首笔画 :" + bushou_strokes[1])

    # Same one-or-two convention as the radical stroke counts above.
    total_strokes = re.findall(r'总笔画：(\d+)', one)
    if len(total_strokes) == 1:
        fields['jt_stroke'] = int(total_strokes[0])
        print("简体总笔画 :" + total_strokes[0])
    elif len(total_strokes) == 2:
        fields['jt_stroke'] = int(total_strokes[0])
        print("简体总笔画 :" + total_strokes[0])
        fields['ft_stroke'] = int(total_strokes[1])
        print("繁体总笔画 :" + total_strokes[1])

    # The entry looks like "康熙字典笔画( 飛:9； )"; only the first
    # character/stroke pair is used.
    kangxi = _first_match(r'康熙字典笔画\( (.+?):(.+?)；', one)
    if kangxi is not None:
        fields['kx_word'] = kangxi[0]
        fields['kx_stroke'] = int(kangxi[1])
        print("康熙字体 :" + kangxi[0])
        print("康熙笔画 :" + kangxi[1])

    wuxing = _first_match(r'汉字五行：(.)', two)
    if wuxing is not None:
        fields['wuxing'] = wuxing
        print("汉字五行：" + wuxing)

    jixiong = _first_match(r'吉凶寓意：(.)', two)
    if jixiong is not None:
        fields['jixiong'] = jixiong
        print("吉凶寓意：" + jixiong)

    is_changyong = _first_match(r'是否为常用字：(.)', two)
    if is_changyong is not None:
        fields['is_changyong'] = is_changyong
        print("是否为常用字：" + is_changyong)

    return fields


def query(conn, word):
    """Fetch the dictionary page for *word* and store it in ``hancheng``.

    The page is downloaded, the fields are parsed out of the ``p.text15``
    and ``div.text16`` elements, and one row is inserted and committed.
    Nothing is inserted when no pinyin could be parsed.

    Args:
        conn: Open PyMySQL connection used for the insert and commit.
        word: The character to look up.

    Raises:
        ValueError: If the page lacks one of the expected elements or a
            stroke count is not an integer.
        urllib.error.URLError: If the page cannot be downloaded.
        pymysql.DatabaseError: If the insert fails (for example on a
            duplicate primary key).
    """
    url = 'http://tool.httpcn.com/Zi/So.asp?Tid=1&wd=' + urllib.parse.quote(word)

    # Parse inside the ``with`` so the HTTP response is closed afterwards.
    with urllib.request.urlopen(url, timeout=3) as response:
        soup = BeautifulSoup(response, 'html.parser')

    one_tag = soup.find('p', attrs={'class': 'text15'})
    two_tag = soup.find('div', attrs={'class': 'text16'})
    if one_tag is None or two_tag is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f"unexpected page layout for {word!r}")

    fields = _parse_hancheng(one_tag.text, two_tag.text)
    if fields['pinyin'] is None:
        return

    columns = (
        'pinyin', 'ft_word', 'kx_word', 'kx_stroke',
        'jt_bushou', 'jt_bushou_stroke', 'jt_stroke',
        'ft_bushou', 'ft_bushou_stroke', 'ft_stroke',
        'wuxing', 'jixiong', 'is_changyong',
    )
    # Placeholders let the driver escape every value and turn None into SQL
    # NULL; interpolating the values into the statement stored the literal
    # text 'None' and broke on values containing a quote.
    sql_str = "insert into hancheng (word," + ",".join(columns) + ") " \
              "values (" + ", ".join(["%s"] * (len(columns) + 1)) + ")"
    params = [word] + [fields[column] for column in columns]

    with conn.cursor() as cursor:
        cursor.execute(sql_str, params)
    conn.commit()

def _is_duplicate_key(exc):
    """Return True when *exc* is a database error carrying MySQL code 1062."""
    return isinstance(exc, DatabaseError) and len(exc.args) >= 1 and exc.args[0] == 1062


# Crawl every character from the word list, one request every 3 seconds.
# Characters that fail for any reason other than "row already exists" are
# recorded in the hancheng_error2 table.
if __name__ == '__main__':

    conn = getConnection()
    try:
        for word in getAllCharacters():
            try:
                query(conn, word)
            except Exception as e:
                print(e)
                # A duplicate key means the character was stored by an
                # earlier run, so it is not a failure worth recording.
                if not _is_duplicate_key(e):
                    try:
                        # Parameterized so a word containing a quote cannot
                        # break the statement.
                        with conn.cursor() as cursor:
                            cursor.execute("insert into hancheng_error2 (word) values (%s)", (word,))
                        conn.commit()
                    except Exception as e2:
                        # Best effort: never abort the crawl because the
                        # error record itself could not be written, but only
                        # claim "already exists" when that is the real cause.
                        if _is_duplicate_key(e2):
                            print(f"数据库已经存在 : " + word)
                        else:
                            print(f"记录失败汉字时出错 : {word} ({e2})")
            print('----------------------------------------')
            # Throttle the requests between lookups.
            time.sleep(3)
    finally:
        # Release the connection even if the loop is interrupted.
        conn.close()

Config.py

import configparser
# Shared module-level parser; other modules use ``from Config import config``.
# ConfigParser.read() silently skips files that do not exist, so a missing
# setting.ini only shows up later as a KeyError on the section lookup.
config = configparser.ConfigParser()
config.read("setting.ini")

setting.ini

[MYSQL]
host = 192.168.0.204
PORT = 3306
USERNAME = xxxx
PASSWORD = xxxxxx
DATA_BASE_NAME = kangxidist

hanzi.yaml

数据量太大了，有需要的联系我（f573471902）

昼
雧
潼
雊
仔
载
毛
谐
瀯
铏
嶙
涉
...

爬取

数据爬取过程是在树莓派上进行的，大概 3 秒一次；如果太频繁，会拉取失败并被限制 IP。

JuneLeo's Blog

科学起名 - python数据爬取

背景

汉语字典

数据爬取