这篇文章的题目，就是最后的前提，嘻嘻，亲测包有用的

get_digits.py

import requests  
import os  
import re  
import time  
import hashlib  
import random  
from bs4 import BeautifulSoup  
from urllib.parse import urljoin, urlparse  
from datetime import datetime  
  
# ======================  
# 1. 配置模块  
# ======================  
# Global HTTP session shared by every request made by this script.
session = requests.Session()
# Merge our headers into the session defaults instead of replacing the whole
# mapping, so the headers requests sets up by default (e.g. Accept-Encoding)
# are still sent and header lookup stays case-insensitive.
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Referer': 'https://www.gdmu.edu.cn/',
})

# Output locations: Markdown files go into news_folder, images into a
# sub-folder of it.
news_folder = '学校要闻'
image_folder = os.path.join(news_folder, 'images')

# Article URLs gathered by collect_article_links(), in discovery order.
essay_list = []
  
  
# ======================  
# 2. 爬取模块  
# ======================  
def create_folders():
    """Make sure the Markdown and image output directories exist."""
    for folder in (news_folder, image_folder):
        # exist_ok=True keeps reruns from failing on existing directories.
        os.makedirs(folder, exist_ok=True)
  
def fetch_page(url):
    """Download *url* through the shared session.

    Args:
        url: Address of the page to request.

    Returns:
        The response object with its encoding forced to UTF-8, or None when
        the request raises a ``requests`` exception (including the HTTP error
        raised by ``raise_for_status``).
    """
    try:
        page = session.get(url, timeout=15)
        page.encoding = 'utf-8'
        page.raise_for_status()
    except requests.exceptions.RequestException as err:
        print(f"请求失败：{str(err)}")
        return None
    return page
  
def get_click_count(clicktype, owner, clickid):
    """Ask the site's click-counter endpoint for an article's view count.

    Args:
        clicktype: First argument of the page's ``_showDynClicks(...)`` call.
        owner: Second argument of that call.
        clickid: Third argument of that call.

    Returns:
        The response body with surrounding whitespace removed when the
        endpoint answers 200 with a non-empty body, "获取失败" for any other
        status or an empty body, and "未知" when the request itself raises.
    """
    api_url = f"https://www.gdmu.edu.cn/system/resource/code/news/click/dynclicks.jsp?clickid={clickid}&owner={owner}&clicktype={clicktype}"
    try:
        response = session.get(api_url, timeout=15)
    except Exception as e:
        print(f"点击量接口异常：{str(e)}")
        return "未知"

    if response.status_code != 200:
        return "获取失败"

    # The value is written into a single-line "key: value" metadata entry, so
    # stray whitespace/newlines must not leak through, and an empty body must
    # not produce an empty field.
    return response.text.strip() or "获取失败"
  
  
def collect_article_links():
    """Walk the paginated news list and append article URLs to ``essay_list``.

    Starts at the first list page and follows the "下页"/"下一页" link until
    there is no next link, the next link points back to the current page or
    to a page already visited, or three consecutive requests fail.
    URLs are appended in discovery order and never duplicated.
    """
    global essay_list
    # Seed with what is already collected so a second call adds no duplicates.
    seen_urls = set(essay_list)
    # List pages already processed; protects against cyclic "next" links.
    visited_pages = set()
    current_url = 'https://www.gdmu.edu.cn/xxyw1.htm'
    failure_count = 0

    # Compiled once; both patterns are reused on every page.
    article_pattern = re.compile(r'href=["\']((?:\.\./)?info/\d+/\d+\.htm)')
    next_pattern = re.compile(r'下页|下一页')

    while True:
        if (response := fetch_page(current_url)) is None:
            failure_count += 1
            if failure_count >= 3:
                print("连续3次请求失败，终止分页抓取")
                break
            # Brief pause so the retry is not fired back-to-back.
            time.sleep(1)
            continue
        failure_count = 0
        visited_pages.add(current_url)

        # Extract article links from the raw HTML.
        for match in article_pattern.findall(response.text):
            full_url = urljoin(current_url, match)
            if full_url not in seen_urls:
                seen_urls.add(full_url)
                essay_list.append(full_url)

        # Locate the "next page" anchor.
        soup = BeautifulSoup(response.text, 'html.parser')
        next_link = soup.find('a', string=next_pattern)
        if not next_link or not (next_href := next_link.get('href')):
            break

        # Resolve the next page address and stop if it does not advance.
        next_url = urljoin(current_url, next_href)
        if urlparse(next_url).path == urlparse(response.url).path:
            break
        if next_url in visited_pages:
            break
        current_url = next_url

    print(f"共收集到 {len(essay_list)} 篇有效文章链接")
  
  
# ======================  
# 3. 转换模块  
# ======================  
def download_image(img_url, article_url):
    """Download one image into ``image_folder`` and return its relative path.

    The local file name is the MD5 of the absolute image URL plus the URL's
    file extension (".jpg" when there is none), so the same URL always maps
    to the same file and is downloaded only once.

    Args:
        img_url: Image address as found in the page (may be relative).
        article_url: URL of the page, used to resolve relative addresses.

    Returns:
        ``'images/<filename>'`` on success or when the file already exists,
        None when the response is not a 200 image response or on any error.
    """
    try:
        full_url = urljoin(article_url, img_url)
        # urlparse() already separates the query string, so .path is clean.
        file_ext = os.path.splitext(urlparse(full_url).path)[1][:10]

        hash_name = hashlib.md5(full_url.encode()).hexdigest()
        filename = f"{hash_name}{file_ext if file_ext else '.jpg'}"
        save_path = os.path.join(image_folder, filename)

        if os.path.exists(save_path):
            return f'images/{filename}'

        # Stream into a temporary file first: an interrupted download must not
        # leave a truncated file that later runs would treat as complete.
        temp_path = f"{save_path}.part"
        with session.get(full_url, stream=True, timeout=20) as response:
            is_image = 'image' in response.headers.get('Content-Type', '')
            if response.status_code != 200 or not is_image:
                return None
            try:
                with open(temp_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                os.replace(temp_path, save_path)
            finally:
                # Only still present when the download or rename failed.
                if os.path.exists(temp_path):
                    os.remove(temp_path)
        return f'images/{filename}'
    except Exception as e:
        print(f"图片处理异常：{str(e)}")
        return None
  
  
def html_to_md(html_content, article_url):
    """Convert an HTML fragment into simple Markdown text.

    Images that download successfully become ``![alt](local path)``,
    ``h1``-``h6`` become ``#`` headings and non-empty ``p`` elements become
    plain paragraphs. Everything else is ignored.

    Args:
        html_content: HTML source of the article body.
        article_url: URL of the article, used to resolve image addresses.

    Returns:
        The Markdown pieces joined by blank lines.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Download every image once and remember its Markdown replacement.
    img_mapping = {}
    for img in soup.find_all('img', {'src': True}):
        src = img['src']
        if src in img_mapping:
            continue
        if (local_path := download_image(src, article_url)):
            alt = img.get('alt', '').strip() or "图片描述"
            img_mapping[src] = f"![{alt}]({local_path})"

    heading_tags = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}

    # Walk the tree in document order and emit Markdown pieces.
    content = []
    for element in soup.descendants:
        tag = getattr(element, 'name', None)
        if tag == 'img':
            # .get() instead of ['src']: an <img> without a src attribute
            # must be skipped, not raise KeyError.
            markdown_image = img_mapping.get(element.get('src'))
            if markdown_image:
                content.append(markdown_image)
        elif tag in heading_tags:
            content.append(f"\n{'#'*int(tag[1])} {element.get_text().strip()}\n")
        elif tag == 'p' and element.get_text(strip=True):
            content.append(element.get_text(' ', strip=True))

    return '\n\n'.join([c for c in content if c.strip()])
  
  
# ======================  
# 4. 处理模块  
# ======================  
def process_single_article(url):
    """Fetch one article and save it as a Markdown file in ``news_folder``.

    The file starts with a metadata header (edit date, crawl time, click
    count, article URL, image URLs) followed by the converted body text.
    Nothing is written when the page cannot be fetched or when no body
    container is found.

    Args:
        url: Absolute URL of the article page.
    """
    if (response := fetch_page(url)) is None:
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    crawl_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Metadata: title. The page may have neither an .arc-tit element nor a
    # <title>; fall back to a placeholder instead of crashing on None.
    title_node = soup.find(class_='arc-tit') or soup.title
    raw_title = title_node.get_text() if title_node is not None else ""
    # Collapse whitespace so the "# title" heading stays on a single line.
    title = ' '.join(raw_title.split())[:100] or "未知标题"

    # Metadata: first YYYY-MM-DD date anywhere in the page text.
    date_match = re.search(r'\d{4}-\d{2}-\d{2}', soup.get_text())
    edit_date = date_match.group() if date_match else "未知日期"

    # Click count: the parameters come from the page's _showDynClicks(...) call.
    click_params = re.search(r'_showDynClicks\(\s*"(\w+)"\s*,\s*(\d+)\s*,\s*(\d+)\s*\)', response.text)
    view_count = get_click_count(*click_params.groups()) if click_params else "未知"

    # Body content.
    content_div = soup.find(class_='uarc-con') or soup.find('article')
    if not content_div:
        print(f"⛔ 未找到正文内容：{url}")
        return

    md_content = html_to_md(str(content_div), url)
    image_links = [urljoin(url, img['src']) for img in content_div.find_all('img', {'src': True})]

    # File name: keep word characters, CJK and '-' only. If nothing survives,
    # use a hash of the URL so the file is never just ".md".
    cleaned_title = re.sub(r'[^\w\u4e00-\u9fff-]', '', title)[:50]
    if not cleaned_title:
        cleaned_title = hashlib.md5(url.encode()).hexdigest()
    filename = f"{cleaned_title}.md"
    try:
        with open(os.path.join(news_folder, filename), 'w', encoding='utf-8') as f:
            f.write(f"# {title}\n\n")
            f.write(f"**🕒 编辑时间**: {edit_date}  \n")
            f.write(f"**📅 爬取时间**: {crawl_time}\n")
            f.write(f"**👁️ 点击次数**: {view_count}\n")
            f.write(f"**🔗 文章链接**: [{url}]({url})\n")
            f.write(f"**🖼️ 图片链接**:\n" + "\n".join(f"- [{link}]({link})" for link in image_links) + "\n\n")
            f.write(md_content)
        print(f"✅ 成功保存：{filename}")
    except Exception as e:
        print(f"💾 文件保存失败：{str(e)}")
  
  
# ======================  
# 5. 主程序  
# ======================  
def main():
    """Collect article links, ask how many to process, then save each one."""
    create_folders()
    collect_article_links()

    total = len(essay_list)
    try:
        num_to_process = int(input(f"请输入处理数量（共{len(essay_list)}篇，默认全部）：") or total)
    except (ValueError, EOFError):
        # Non-numeric input or closed stdin: fall back to processing all.
        num_to_process = total

    # Clamp to the valid range so the slice, the progress display and the
    # success rate all refer to the same number of articles.
    num_to_process = max(0, min(num_to_process, total))

    success = 0
    for idx, url in enumerate(essay_list[:num_to_process], 1):
        print(f"\n📝 处理进度：{idx}/{num_to_process} - {url}")
        try:
            process_single_article(url)
            # NOTE: process_single_article() returns None whether it saved
            # the article or skipped it, so this counts articles handled
            # without an exception rather than files actually written.
            success += 1
            if idx % 3 == 0:
                # Pause after every third article to go easy on the server.
                delay = random.uniform(1.5, 3.5)
                print(f"⏳ 随机等待 {delay:.1f}秒...")
                time.sleep(delay)
        except Exception as e:
            print(f"⚠️ 处理异常：{str(e)}")

    if num_to_process:
        print(f"\n🎉 完成！成功率：{success/num_to_process:.1%}")
    else:
        # Avoid dividing by zero when there was nothing to process.
        print("\n🎉 完成！没有需要处理的文章")


if __name__ == "__main__":
    main()

模块一 --代码详解

库的使用：

requests:
- 用途：用于发送HTTP请求，获取网页内容。
- 发挥作用的地方：在fetch_page函数中用于请求网页数据；在get_click_count函数中调用点击量接口。
os:
- 用途：提供了与操作系统交互的功能，如文件和目录操作。
- 发挥作用的地方：在create_folders函数中创建必要的存储文件夹；在处理模块中生成和管理文件路径。
re (正则表达式):
- 用途：用于字符串匹配、搜索、替换等操作。
- 发挥作用的地方：在收集文章链接时解析HTML内容找到文章链接；在提取点击量接口参数时使用；以及在文件名规范化处理中去除非法字符。
time:
- 用途：提供时间相关的功能，如暂停程序执行（sleep）。
- 发挥作用的地方：在主程序中随机等待一段时间以避免过快请求服务器引起封禁。
hashlib:
- 用途：提供多种哈希算法实现，可用于生成唯一标识符。
- 发挥作用的地方：在下载图片时生成唯一的文件名。
random:
- 用途：生成伪随机数。
- 发挥作用的地方：在主程序中生成一个随机的等待时间，以便在连续请求之间添加间隔。
BeautifulSoup from bs4:
- 用途：用于解析HTML和XML文档，方便地从页面中提取所需信息。
- 发挥作用的地方：在整个项目中广泛用于解析HTML文档，包括但不限于提取文章标题、编辑日期、内容及图片链接，并进行相应的转换和处理。
urljoin from urllib.parse:
- 用途：用于将相对URL转换为绝对URL。
- 发挥作用的地方：在收集文章链接和下载图片时，确保所有资源链接都是完整的绝对路径。
datetime:
- 用途：提供日期和时间相关的函数和类。
- 发挥作用的地方：记录每篇文章的爬取时间。

全局会话配置

通过 requests.Session() 创建了一个持久化的HTTP连接池，并设置了默认的请求头
通过 session.headers 其中 'User-Agent' 模拟了一个特定版本的Chrome浏览器访问网页的行为，以此来避免被服务器识别为自动化脚本而拒绝服务
'Referer' 头部字段指定了来源页面，这里设为广东医科大学的主页，这个信息有时会被服务器用来做安全验证或者统计分析，能防止一些反爬机制

文件存储配置

定义了两个变量 news_folder 和 image_folder 来指定新闻和图片的保存路径
image_folder 是在 news_folder 目录下创建的一个子目录，专门用于存放下载的图片资源
- os.path.join 这个能保证在各种操作系统中均能在文件路径中准确的创建文件（不同的操作系统可能使用不同的路径分隔符\ 或者/）

爬取模块

create_folders ()

创建必要的存储文件夹；使用 exist_ok=True 保证不会因为文件夹已经存在而报错

fetch_page(url)

尝试通过session.get请求目标URL，并设置超时时间为15秒
设置响应编码为UTF-8，确保中文字符能够正确解析
检查HTTP状态码是否表示成功（200 OK），如果失败则抛出异常
如果请求过程中出现任何异常（如网络问题），会捕获并打印错误信息，然后返回 None

get_click_count(clicktype, owner, clickid)

这里用到了一点逆向工程：点击量接口是我通过抓包找到的，不过我还不太会分析，找到包之后交给 AI 帮忙，才逆向成功

主要是，获取到上述的三个参数即可获得对应文章的 点击数

collect_article_links()

本人以为我观察到他是从 124 倒序到 1 已经天下无双了，结果还有狠招

动态地从指定网站抓取文章链接，并将这些链接存储在全局列表 essay_list 中，它通过循环遍历分页来实现这一目标，直到没有更多的页面为止

global essay_list：声明使用全局变量 essay_list 来存储找到的文章链接
seen_urls = set()：初始化一个集合 seen_urls，用于避免重复添加相同的链接
- 为什么不直接将文章的数组变为集合--> 不能保持元素的顺序，失去以后的扩展性，不支持索引访问和切片操作
current_url：设置初始URL为"https://www.gdmu.edu.cn/xxyw1.htm"，这是抓取的第一个页面
failure_count = 0：初始化失败计数器，用于记录连续请求失败的次数，超过三次，直接终止分页抓取，没有连续失败三次，则清空为 0
matches = re.findall(r'href=["\']((?:\.\./)?info/\d+/\d+\.htm)', response.text) 找到所有文章链接
- 下方的 if 判断是否有重复，没有则插入
BeautifulSoup(response.text, 'html.parser')：使用BeautifulSoup解析HTML内容
soup.find('a', string=re.compile(r'下页|下一页'))：查找文本包含"下页"或"下一页"的 <a> 标签（代码中使用的是 string 参数，text 是它的旧写法）
if not next_link or not (next_href:= next_link.get('href'))：如果没有找到下一页链接或者链接的 href 属性为空，则退出循环
urlparse(current_url).path == urlparse(response.url).path：检查新URL的路径是否与当前页面相同。如果相同，则说明已经到达最后一页，退出循环

转换模块

download_image(img_url, article_url)

full_url = urljoin(article_url, img_url)：如果 img_url 是相对路径，则使用 urljoin 函数将它与 article_url 结合起来，生成一个完整的URL
parsed = urlparse(full_url)：
- urlparse(full_url): 使用 urlparse 函数解析 full_url，返回一个包含URL各部分的对象（如scheme、netloc、path等）
  - scheme: 协议（如 http 或 https）
  - netloc: 网络位置（如 www.example.com）
  - path: 路径（如 /images/picture.jpg）
  - params: 参数（不常用）
  - query: 查询字符串（如 ?key=value）
  - fragment: 片段标识符（如 #section1）
clean_path = parsed.path.split('?')[0]：
- parsed.path: 获取URL中的路径部分（如 /images/picture.jpg）。
- split('?')[0]: 防御性地去除查询字符串部分。实际上 urlparse 已经把查询参数（如尺寸信息）单独放进了 query 字段，parsed.path 中不会再包含 '?'，所以这一步通常不会改变结果；我们只需要路径部分来确定文件的扩展名。
file_ext = os.path.splitext(clean_path)[1][:10] ：
- os.path.splitext(clean_path): 分割路径，返回一个元组 (root, ext)，其中 root 是文件路径（不含扩展名），ext 是文件扩展名（含点号）
  - 例如：对于路径 /images/picture.jpg，结果是 ('/images/picture', '.jpg')
- [1]: 取元组的第二个元素（即扩展名部分）
- [: 10]: 截取扩展名的前10个字符，以防扩展名过长导致问题（虽然一般不会超过10个字符）
这三段是生成唯一文件名
- hash_name = hashlib.md5(full_url.encode()).hexdigest()
- filename = f"{hash_name}{file_ext if file_ext else '.jpg'}"
- save_path = os.path.join(image_folder, filename)
最后一段，下载图片并写入文件夹

html_to_md(html_content, article_url)

将获取的响应内容（HTML），转换为 md 格式

处理模块

这部分较为简单就讲讲函数吧

soup.find(class_='arc-tit'): 尝试找到具有 class="arc-tit" 的元素，并提取其文本内容。如果没有找到，则使用 <title> 标签的内容作为标题
.get_text().strip()[: 100]: 获取文本内容并去除前后空白字符，限制长度为前100个字符
re.search(r'\d{4}-\d{2}-\d{2}', soup.get_text()): 在整个网页文本中查找符合 YYYY-MM-DD 格式的日期
.group(): 如果找到匹配项，则提取该日期；否则，返回 "未知日期"
click_params：是抓包，提取参数的

data_processor.py

import os  
from datetime import datetime  
import pandas as pd  
import matplotlib.pyplot as plt  
import matplotlib.dates as mdates  
from openpyxl.drawing.image import Image  
from openpyxl.styles import Alignment, Font  
from openpyxl.utils import get_column_letter  
  
# ======================  
# 1. 提取模块  
# ======================  
def _markdown_link_target(line):
    """Return the URL of the last ``[text](url)`` link on *line*, or ''."""
    _, separator, target = line.rpartition('](')
    if not separator:
        return ''
    target = target.strip()
    # Drop only the closing parenthesis of the link itself, so URLs that
    # contain parentheses are kept intact.
    if target.endswith(')'):
        target = target[:-1]
    return target.strip()


def parse_md_files(news_folder):
    """Read the metadata header of every ``.md`` file in *news_folder*.

    Only the first 100 lines of each file are scanned. A file is included in
    the result only when both its edit time and its crawl time parse.

    Args:
        news_folder: Directory containing the Markdown files.

    Returns:
        A list of dicts with the keys ``title``, ``edit_time`` (datetime),
        ``crawl_time`` (datetime), ``click_count`` (str, '未知' when missing
        or empty), ``url``, ``filename`` and ``image_links`` (list of str).
    """
    edit_prefix = '**🕒 编辑时间**:'
    crawl_prefix = '**📅 爬取时间**:'
    click_prefix = '**👁️ 点击次数**:'
    url_prefix = '**🔗 文章链接**:'
    image_prefix = '**🖼️ 图片链接**:'

    articles = []

    # Sorted so the result order does not depend on the file system.
    for filename in sorted(os.listdir(news_folder)):
        if not filename.endswith('.md'):
            continue

        file_path = os.path.join(news_folder, filename)
        article_info = {
            'title': '',
            'edit_time': None,
            'crawl_time': None,
            'click_count': '未知',
            'url': '',
            'filename': filename,
            'image_links': []
        }

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                lines = f.readlines()

            # The title is the first line, written as a "# ..." heading.
            if lines:
                article_info['title'] = lines[0].strip().lstrip('#').strip()

            in_image_section = False  # True while reading the "- [..](..)" list
            for raw_line in lines[1:100]:
                line = raw_line.strip()

                if line.startswith(image_prefix):
                    in_image_section = True
                    continue
                if in_image_section:
                    if line.startswith('- ['):
                        img_url = _markdown_link_target(line)
                        if img_url:
                            article_info['image_links'].append(img_url)
                        continue
                    # Any other line ends the image list.
                    in_image_section = False

                # Values are taken as "everything after the prefix", so an
                # empty value is simply empty instead of raising IndexError
                # and discarding the whole article.
                if line.startswith(edit_prefix):
                    value = line[len(edit_prefix):].strip()
                    try:
                        article_info['edit_time'] = datetime.strptime(value, "%Y-%m-%d")
                    except ValueError:
                        pass  # leaves edit_time None; the file is then skipped
                elif line.startswith(crawl_prefix):
                    value = line[len(crawl_prefix):].strip()
                    try:
                        article_info['crawl_time'] = datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
                    except ValueError:
                        pass  # leaves crawl_time None; the file is then skipped
                elif line.startswith(click_prefix):
                    value = line[len(click_prefix):].strip()
                    if value:
                        article_info['click_count'] = value
                elif line.startswith(url_prefix):
                    article_info['url'] = _markdown_link_target(line)

            # Validity check: both timestamps are required downstream.
            if article_info['edit_time'] and article_info['crawl_time']:
                articles.append(article_info)

        except Exception as e:
            print(f"解析文件 {filename} 出错: {str(e)}")

    return articles
  
# ======================  
# 2. 保存模块  
# ======================  
def save_to_excel(data, output_file, chart_image_path=None):
    """Write the article records to an Excel sheet with centred cells.

    Each article is expanded to one row per image link; the other columns of
    a multi-image article are merged vertically. Link columns get a coloured,
    underlined font, and the chart image (if given and present on disk) is
    placed below the table.

    Args:
        data: List of article dicts as produced by ``parse_md_files``.
        output_file: Path of the ``.xlsx`` file to write.
        chart_image_path: Optional path of an image to embed under the table.
    """
    if not data:
        # Nothing to write; the column operations below need real records.
        print("没有可保存的文章数据")
        return

    df = pd.DataFrame(data)

    # Remember which original record each exploded row belongs to.
    df['_group_id_'] = df.index
    df = df.explode('image_links').reset_index(drop=True)
    df['image_links'] = df['image_links'].fillna('无图片')

    df = df.rename(columns={
        'title': '文章标题',
        'edit_time': '编辑时间',
        'crawl_time': '爬取时间',
        'click_count': '点击量',
        'url': '文章链接',
        'filename': '文件名',
        'image_links': '图片链接'
    })

    # Render the timestamps as text; same formats as the Markdown header.
    df['编辑时间'] = df['编辑时间'].dt.strftime('%Y-%m-%d')
    df['爬取时间'] = df['爬取时间'].dt.strftime('%Y-%m-%d %H:%M:%S')

    # The group id is bookkeeping only: take it out of the frame so it is not
    # written to the sheet as an extra column.
    group_ids = df.pop('_group_id_')

    # Resolve column positions by name instead of hard-coding letters.
    article_col = get_column_letter(df.columns.get_loc('文章链接') + 1)
    image_col = get_column_letter(df.columns.get_loc('图片链接') + 1)
    merge_columns = [
        position for position, name in enumerate(df.columns, 1)
        if name != '图片链接'
    ]

    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        df.to_excel(writer, index=False, sheet_name='文章数据')
        worksheet = writer.sheets['文章数据']

        # Centre every cell; wrap text so merged cells stay readable.
        center_alignment = Alignment(
            horizontal='center',
            vertical='center',
            wrap_text=True
        )
        for row in worksheet.iter_rows():
            for cell in row:
                cell.alignment = center_alignment

        # Column widths: fixed for image links, content-based (capped) otherwise.
        for col_idx, col in enumerate(df.columns, 1):
            col_letter = get_column_letter(col_idx)
            if col == '图片链接':
                worksheet.column_dimensions[col_letter].width = 105
            else:
                max_len = max(df[col].astype(str).map(len).max(), len(str(col))) + 2
                worksheet.column_dimensions[col_letter].width = min(max_len, 105)

        # Merge the per-article columns across the rows of one article.
        # +2 converts a 0-based frame index to a 1-based sheet row below the header.
        for _, group in df.groupby(group_ids):
            if len(group) > 1:
                start_row = group.index.min() + 2
                end_row = group.index.max() + 2
                for col_num in merge_columns:
                    worksheet.cell(row=start_row, column=col_num).alignment = center_alignment
                    worksheet.merge_cells(
                        start_row=start_row,
                        end_row=end_row,
                        start_column=col_num,
                        end_column=col_num
                    )

        # Link styling (colour and underline only).
        hyperlink_font = Font(color="0563C1", underline="single")
        image_font = Font(color="FF69B4", underline="single")
        link_pairs = zip(df['文章链接'], df['图片链接'])
        for excel_row, (article_url, image_url) in enumerate(link_pairs, 2):
            if article_url:
                worksheet[f'{article_col}{excel_row}'].font = hyperlink_font
            if image_url != '无图片':
                worksheet[f'{image_col}{excel_row}'].font = image_font

        # Embed the chart under the table.
        if chart_image_path and os.path.exists(chart_image_path):
            try:
                img = Image(chart_image_path)
                img.width = 800
                img.height = 500
                worksheet.add_image(img, f'A{len(df) + 3}')
            except Exception as e:
                print(f"图表插入失败：{str(e)}")

    print(f"成功保存带全局居中对齐的Excel文件: {output_file}")
  
  
def truncate_title(title, max_length=7):
    """Return *title* cut to ``max_length`` characters, with '..' appended when cut."""
    if len(title) <= max_length:
        return title
    return f"{title[:max_length]}.."
  
  
# ======================  
# 3. 图片模块  
# ======================  
def visualize_timeline(data, output_path='数据分析图_时间趋势版.png'):
    """Draw the edit-time / crawl-time chart and save it as a PNG.

    Articles are placed on the y axis in descending click-count order. Two
    lines connect the articles in edit-time order and in crawl-time order,
    and a dashed segment joins each article's edit time to its crawl time.

    Args:
        data: List of article dicts with ``title``, ``filename``,
            ``edit_time``/``crawl_time`` (datetime) and ``click_count``.
            The click counts must be mutually comparable because they are
            used as the sort key.
        output_path: File the chart is written to; the default keeps the
            file name the script has always used.
    """
    # Fonts able to render Chinese text.
    plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'WenQuanYi Micro Hei']
    plt.rcParams['axes.unicode_minus'] = False

    # Descending click count defines the vertical order.
    sorted_data = sorted(data, key=lambda x: x['click_count'], reverse=True)

    # filename -> vertical position of that article in the chart.
    index_map = {a['filename']: idx for idx, a in enumerate(sorted_data)}

    # Canvas height grows with the number of articles.
    fig, ax = plt.subplots(figsize=(14, len(sorted_data) * 0.6 + 4))
    try:
        ax.invert_yaxis()  # highest click count at the top

        # Date formatting of the x axis.
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%m-%d %H:%M'))
        ax.xaxis.set_major_locator(mdates.AutoDateLocator())

        # Trend line through the articles in edit-time order.
        edit_sorted = sorted(data, key=lambda x: x['edit_time'])
        edit_x = [a['edit_time'] for a in edit_sorted]
        edit_y = [index_map[a['filename']] for a in edit_sorted]

        # Trend line through the articles in crawl-time order.
        crawl_sorted = sorted(data, key=lambda x: x['crawl_time'])
        crawl_x = [a['crawl_time'] for a in crawl_sorted]
        crawl_y = [index_map[a['filename']] for a in crawl_sorted]

        ax.plot(edit_x, edit_y,
                color='#2ecc71', linewidth=2, alpha=0.7,
                marker='o', markersize=8, label='编辑时间趋势')
        ax.plot(crawl_x, crawl_y,
                color='#e74c3c', linewidth=2, alpha=0.7,
                marker='s', markersize=8, label='爬取时间趋势')

        for idx, article in enumerate(sorted_data):
            edit_time = article['edit_time']
            crawl_time = article['crawl_time']

            # Dashed segment from this article's edit time to its crawl time.
            ax.plot(
                [edit_time, crawl_time],
                [idx, idx],
                linestyle='--',
                color='gray',
                alpha=0.4,
                linewidth=1
            )

            truncated_title = truncate_title(article['title'])
            annotation_text = f"{truncated_title}\n点击: {article['click_count']}"

            # Anchor the label at the crawl point (text to its left) when the
            # gap is under six hours, otherwise at the edit point (text to
            # its right).
            time_gap = (crawl_time - edit_time).total_seconds()
            if time_gap < 3600 * 6:
                xy = (crawl_time, idx)
                offset = (-10, 0)
                ha = 'right'
            else:
                xy = (edit_time, idx)
                offset = (10, 0)
                ha = 'left'

            ax.annotate(
                annotation_text,
                xy,
                xytext=offset,
                textcoords='offset points',
                fontsize=8,
                alpha=0.9,
                ha=ha,
                va='center',
                arrowprops=dict(
                    arrowstyle='->',
                    color='gray',
                    alpha=0.3,
                    linewidth=0.8
                )
            )

        # Axis labels, title and grid.
        ax.set_yticks(range(len(sorted_data)))
        ax.set_yticklabels([truncate_title(a['title']) for a in sorted_data], fontsize=8)
        ax.set_xlabel('时间轴', fontsize=12)
        ax.set_ylabel('文章（点击量降序）', fontsize=12)
        ax.set_title('文章时效性分析（时间趋势线+点击量排序）', fontsize=14, pad=20)
        ax.grid(True, alpha=0.2, linestyle=':')

        # Legend placed outside the plotting area, to the right.
        ax.legend(
            loc='upper left',
            bbox_to_anchor=(1, 0.9),
            frameon=True,
            shadow=True,
            fontsize=10
        )

        fig.tight_layout()
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
    finally:
        # Release the figure; pyplot otherwise keeps it alive until exit.
        plt.close(fig)

    print(f"已生成可视化图表: {output_path}")
  
  
# ======================  
# 4.主程序  
# ======================  
def main():
    """Parse the saved Markdown files, draw the chart and write the Excel file."""
    # Configuration.
    NEWS_FOLDER = '学校要闻'
    EXCEL_OUTPUT = '广东医科大学--学校要闻.xlsx'
    CHART_IMAGE = '数据分析图_时间趋势版.png'  # file written by visualize_timeline()

    articles_data = parse_md_files(NEWS_FOLDER)

    if not articles_data:
        print("未找到有效文章数据！")
        return

    # Only articles with a purely numeric click count can be ranked.
    valid_data = [
        a for a in articles_data
        if isinstance(a.get('click_count'), str) and a['click_count'].isdigit()
    ]
    # The dicts are shared with articles_data, so the Excel sheet also
    # receives these click counts as integers.
    for a in valid_data:
        a['click_count'] = int(a['click_count'])

    # The chart must exist before the Excel file is written. With no rankable
    # article the chart would be empty, so skip it instead of embedding it.
    if valid_data:
        visualize_timeline(valid_data)
        chart_path = CHART_IMAGE
    else:
        print("没有包含有效点击量的文章，跳过图表生成")
        chart_path = None

    save_to_excel(articles_data, EXCEL_OUTPUT, chart_path)


if __name__ == "__main__":
    main()

库的使用：

os 库：提供了文件操作的功能
datetime 库：处理日期和时间
pandas 库：一个强大的数据分析和处理库。它提供了一种灵活高效的DataFrame对象（即表格型数据结构），支持自动对齐数据、智能索引以及多种方式的数据输入输出。它非常适合用于数据清洗、转换、分析等任务
matplotlib.pyplot库：
- Matplotlib的一个子库，提供了类似MATLAB的绘图接口。通过它可以轻松地制作出高质量的图表，如线图、散点图、柱状图、饼图等。它是数据可视化的重要工具之一
matplotlib.dates库：
- 这部分属于 matplotlib 库，专门用于处理日期相关的绘图需求。它提供了多种方法来格式化日期坐标轴，使得在绘制含有时间信息的数据时更加方便
openpyxl.drawing.image库：
- openpyxl库允许你在Python中读写Excel 2010 xlsx/xlsm/xltx/xltm文件。其中openpyxl.drawing.image模块可以帮助你在Excel文档中插入图片，而Image类正是用于表示要插入到Excel中的图像对象
openpyxl.styles.Alignment, Font库
- 这些都是openpyxl库中的样式设置类，用于调整Excel单元格内的文本对齐方式(Alignment)和字体属性(Font)。例如，你可以设置文字是否居中显示，改变字体大小和颜色等
openpyxl.utils.get_column_letter库：
- 这个函数也是 openpyxl 库的一部分，用于将列的数字索引转换为Excel中的列字母标识符（如1变为'A'，2变为'B'等）。这对于动态操作Excel表中的特定列很有帮助

提取模块

os.listdir(news_folder):用于列出指定目录下的所有文件和文件夹，返回一个包含该目录下所有文件名和文件夹名的列表
filename.endswith('.md'):用于检查字符串是否以指定的后缀结束。如果是，则返回 True；否则返回 False
article_info：参数列表
readlines():一次性读取文件的所有行，返回一个由各行字符串组成的列表（之后再通过切片，例如 lines[1:100]，选定要扫描的范围）
strip() 用于移除字符串开头和结尾的所有空白字符（包括空格、制表符 \t、换行符 \n 等），lstrip() 则只移除开头部分。两者也都可以指定要移除的字符，例如 strip('abc') 会移除字符串两端的'a'、'b'、'c'字符，代码中的 lstrip('#') 就是去掉标题行开头的 # 号
startswith(prefix):用于检查字符串是否以指定的前缀（prefix）开始。如果是，则返回 True；否则返回 False。例如，if line.startswith('**🖼️ 图片链接**:'): 这行代码用于判断当前行是否为图片链接部分的起始行，从而决定是否进入图片链接的解析逻辑

保存模块

df = pd.DataFrame(data) 将数据转变为DataFrame，便于后续操作
使用 rename(columns={…}) 方法将原始字段名转换为更易读的中文名称，例如 'title' 变为 '文章标题' 等
dt.strftime() 方法重新格式化日期显示格式，以适应Excel中的展示需求

爬取母校网站的“学校要闻”栏目的内容，并写入excel进行数据分析

get_digits. py

模块一 --代码详解

全局会话配置

文件存储配置

爬取模块

create_folders ()

fetch_page(url)

get_click_count(clicktype, owner, clickid)

collect_article_links()

转换模块

download_image(img_url, article_url)

html_to_md(html_content, article_url)

处理模块

data_processor. py

提取模块

保存模块

三七

分享文章

文章目录