diff --git a/comprehensive-tools/README.md b/comprehensive-tools/README.md new file mode 100644 index 0000000..649ef99 --- /dev/null +++ b/comprehensive-tools/README.md @@ -0,0 +1,68 @@ +# 综合工具集合 + +本项目提供了一个综合性的工具集合,包含两个主要功能模块:网络爬虫工具和系统管理工具。 + +## 功能模块 + +### 1. 网络爬虫工具 +- 支持自定义URL输入和爬取规则配置 +- 包含数据提取、存储和导出功能 +- 实现基本的反爬机制和错误处理 +- 支持JSON和CSV格式数据导出 + +### 2. 系统管理工具 +- 批处理文本文件自动强制删除功能 +- 文件强制删除的安全确认机制 +- 系统强制关机功能及定时关机选项 +- 系统重启功能及定时重启选项 + +## 安装依赖 + +```bash +pip install requests beautifulsoup4 +``` + +## 使用方法 + +1. 运行主程序 + +```bash +python main.py +``` + +2. 选择功能模块 + +### 网络爬虫工具使用示例 + +1. 选择"1. 网络爬虫工具" +2. 选择"1. 开始爬取" +3. 输入起始URL,例如:`https://example.com` +4. 根据提示设置爬取规则(可选) +5. 爬取完成后选择是否保存数据及保存格式 + +### 系统管理工具使用示例 + +1. 选择"2. 系统管理工具" +2. 选择相应的功能: + - "1. 批量删除文本文件":删除指定目录(含子目录)下的所有 .txt 文本文件 + - "2. 删除指定文件":删除用户指定的文件 + - "3. 关闭系统":关闭计算机 + - "4. 重启系统":重启计算机 + +## 注意事项 + +- 网络爬虫工具不会自动检查robots.txt,请在爬取前自行确认目标站点的爬取规则,请勿用于非法爬取 +- 系统管理工具的关机和重启功能需要管理员权限 +- 批量删除文件时请谨慎操作,建议先备份重要数据 + +## 代码结构 + +- `main.py`:主程序,提供用户界面 +- `spider.py`:网络爬虫模块 +- `system_tools.py`:系统管理工具模块 +- `test.py`:测试文件 +- `README.md`:使用说明 + +## 贡献 + +欢迎提交问题和改进建议。 diff --git a/comprehensive-tools/__pycache__/main.cpython-314.pyc b/comprehensive-tools/__pycache__/main.cpython-314.pyc new file mode 100644 index 0000000..f92abd9 Binary files /dev/null and b/comprehensive-tools/__pycache__/main.cpython-314.pyc differ diff --git a/comprehensive-tools/__pycache__/spider.cpython-314.pyc b/comprehensive-tools/__pycache__/spider.cpython-314.pyc new file mode 100644 index 0000000..b0800d9 Binary files /dev/null and b/comprehensive-tools/__pycache__/spider.cpython-314.pyc differ diff --git a/comprehensive-tools/__pycache__/system_tools.cpython-314.pyc b/comprehensive-tools/__pycache__/system_tools.cpython-314.pyc new file mode 100644 index 0000000..f6a1d8f Binary files /dev/null and b/comprehensive-tools/__pycache__/system_tools.cpython-314.pyc differ diff --git a/comprehensive-tools/main.py b/comprehensive-tools/main.py new file mode 100644 index 
# ---------------------------------------------------------------------------
# comprehensive-tools/main.py -- interactive console front end.
# Uses WebSpider (spider.py) and SystemTools (system_tools.py).
# ---------------------------------------------------------------------------


def _confirm(prompt):
    """Ask a yes/no question; return True only for an explicit 'y' answer."""
    return input(prompt).strip().lower() == 'y'


def _read_non_negative_int(prompt):
    """Read a whole number >= 0 from the user.

    Returns:
        The parsed integer, or None (after printing a hint) when the input is
        not a non-negative integer. Returning None lets the menu loops simply
        re-display the menu instead of crashing with ValueError.
    """
    raw = input(prompt).strip()
    try:
        value = int(raw)
    except ValueError:
        value = -1  # fold "not a number" into the same rejection path
    if value < 0:
        print('输入无效,请输入非负整数')
        return None
    return value


def print_menu():
    """Print the top-level menu."""
    separator = '=' * 60
    print(separator)
    print('综合工具集合')
    print(separator)
    print('1. 网络爬虫工具')
    print('2. 系统管理工具')
    print('0. 退出')
    print(separator)


def _collect_rules():
    """Interactively build the list of crawl rules.

    Returns:
        A list of ``{'selector': str, 'extract': {field: extractor}}`` dicts,
        the shape ``WebSpider.crawl`` consumes.
    """
    rules = []
    while True:
        selector = input('请输入CSS选择器: ')
        extract = {}
        if _confirm('是否设置提取规则? (y/n): '):
            while True:
                key = input('请输入字段名: ')
                extract[key] = input('请输入提取方式 (text 或 attr:属性名): ')
                if not _confirm('是否添加更多提取规则? (y/n): '):
                    break
        rules.append({'selector': selector, 'extract': extract})
        if not _confirm('是否添加更多爬取规则? (y/n): '):
            return rules


def _save_results(spider, data):
    """Offer to save crawled *data* as JSON or CSV using *spider*'s writers."""
    if not _confirm('是否保存数据? (y/n): '):
        return

    fmt = input('保存格式 (json/csv): ').strip().lower()
    filename = input('请输入文件名: ').strip()

    savers = {'json': spider.save_to_json, 'csv': spider.save_to_csv}
    saver = savers.get(fmt)
    if saver is None:
        # Previously an unknown format was ignored without any feedback.
        print(f'不支持的保存格式: {fmt}')
        return
    if not filename:
        print('文件名不能为空')
        return

    target = f'{filename}.{fmt}'
    try:
        saver(data, target)
    except OSError as e:
        print(f'保存失败: {e}')
        return
    print(f'数据已保存到 {target}')


def spider_menu():
    """Run the crawler sub-menu until the user returns to the main menu."""
    spider = WebSpider()

    while True:
        print('\n网络爬虫工具')
        print('1. 开始爬取')
        print('2. 设置代理')
        print('3. 设置爬取深度')
        print('0. 返回主菜单')

        choice = input('请选择: ').strip()

        if choice == '1':
            url = input('请输入起始URL: ').strip()
            rules = None
            if _confirm('是否设置爬取规则? (y/n): '):
                rules = _collect_rules()

            data = spider.crawl(url, rules)
            if data:
                _save_results(spider, data)
            else:
                print('未爬取到任何数据')

        elif choice == '2':
            proxy = input('请输入代理地址 (格式: http://ip:port): ').strip()
            if proxy:
                spider.set_proxies({'http': proxy, 'https': proxy})
                print('代理设置成功')
            else:
                # An empty answer clears the proxy instead of storing ''.
                spider.set_proxies({})
                print('代理已清除')

        elif choice == '3':
            depth = _read_non_negative_int('请输入爬取深度: ')
            if depth is not None:
                spider.set_max_depth(depth)
                print('爬取深度设置成功')

        elif choice == '0':
            break

        else:
            print('无效选择,请重新输入')


def _print_delete_result(result):
    """Print the summary of a ``{'deleted': [...], 'failed': [...]}`` result."""
    print(f'成功删除: {len(result["deleted"])} 个文件')
    print(f'失败: {len(result["failed"])} 个文件')
    if result["failed"]:
        print('失败列表:')
        for file, error in result["failed"]:
            print(f' - {file}: {error}')


def system_tools_menu():
    """Run the system-tools sub-menu until the user returns to the main menu."""
    system_tools = SystemTools()

    while True:
        print('\n系统管理工具')
        print('1. 批量删除文本文件')
        print('2. 删除指定文件')
        print('3. 关闭系统')
        print('4. 重启系统')
        print('0. 返回主菜单')

        choice = input('请选择: ').strip()

        if choice == '1':
            directory = input('请输入目录路径: ').strip()
            force = _confirm('是否强制删除 (y/n): ')
            _print_delete_result(
                system_tools.batch_delete_text_files(directory, force))

        elif choice == '2':
            raw = input('请输入文件路径,多个文件用逗号分隔: ')
            # Drop empty entries produced by stray or trailing commas.
            files = [part.strip() for part in raw.split(',') if part.strip()]
            if not files:
                print('未输入任何文件路径')
                continue
            force = _confirm('是否强制删除 (y/n): ')
            _print_delete_result(system_tools.delete_files(files, force))

        elif choice == '3':
            timeout = _read_non_negative_int('请输入延迟时间 (秒,0表示立即): ')
            if timeout is None:
                continue
            force = _confirm('是否强制关闭 (y/n): ')
            system_tools.shutdown_system(force, timeout)

        elif choice == '4':
            timeout = _read_non_negative_int('请输入延迟时间 (秒,0表示立即): ')
            if timeout is None:
                continue
            force = _confirm('是否强制重启 (y/n): ')
            system_tools.restart_system(force, timeout)

        elif choice == '0':
            break

        else:
            print('无效选择,请重新输入')


def main():
    """Show the main menu and dispatch to the sub-menus until the user quits."""
    handlers = {'1': spider_menu, '2': system_tools_menu}
    try:
        while True:
            print_menu()
            choice = input('请选择: ').strip()

            if choice == '0':
                print('感谢使用,再见!')
                break

            handler = handlers.get(choice)
            if handler is None:
                print('无效选择,请重新输入')
            else:
                handler()
    except (EOFError, KeyboardInterrupt):
        # Ctrl-D / Ctrl-C at a prompt: leave quietly instead of a traceback.
        print('\n感谢使用,再见!')


if __name__ == '__main__':
    main()


# ---------------------------------------------------------------------------
# comprehensive-tools/spider.py -- small recursive web crawler.
# Uses requests, bs4.BeautifulSoup, json, csv, time, random and urllib.parse
# (urljoin, urlparse).
# ---------------------------------------------------------------------------


class WebSpider:
    """Depth-limited recursive crawler with rule-based data extraction.

    Args:
        verify_ssl: Whether TLS certificates are verified. Defaults to True;
            pass False only for hosts whose certificates are knowingly broken.
        max_links_per_page: How many distinct links of each page are followed.
        delay_range: ``(low, high)`` seconds; a random pause in this range is
            made before every request as a politeness delay.
    """

    def __init__(self, verify_ssl=True, max_links_per_page=10,
                 delay_range=(1, 3)):
        self.headers = {
            'User-Agent': (
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/91.0.4472.124 Safari/537.36'
            )
        }
        self.proxies = {}
        self.visited_urls = set()
        self.max_depth = 2
        self.verify_ssl = verify_ssl
        self.max_links_per_page = max_links_per_page
        self.delay_range = delay_range

    def set_proxies(self, proxies):
        """Set the proxies mapping handed to ``requests.get``."""
        self.proxies = proxies

    def set_max_depth(self, depth):
        """Set the maximum link depth followed from the start URL."""
        self.max_depth = depth

    def crawl(self, start_url, rules=None, depth=0):
        """Crawl *start_url* and the pages it links to.

        Args:
            start_url: Page to fetch.
            rules: Optional list of ``{'selector': css, 'extract': {...}}``
                dicts; each matched element yields one item dict. Without
                rules no items are produced.
            depth: Current recursion depth; callers normally leave it at 0.

        Returns:
            A list of extracted item dicts (possibly empty). Pages that fail
            to download are reported on stdout and skipped.
        """
        if depth == 0:
            # A top-level call starts a fresh crawl. Without this reset a
            # second crawl() on the same instance returned [] immediately
            # because the start URL was still marked as visited.
            self.visited_urls.clear()

        if depth > self.max_depth or start_url in self.visited_urls:
            return []

        self.visited_urls.add(start_url)
        print(f'Crawling: {start_url}')

        soup = self._fetch_soup(start_url)
        if soup is None:
            return []

        data = self._extract_items(soup, rules)
        for link in self._extract_links(soup, start_url):
            data.extend(self.crawl(link, rules, depth + 1))
        return data

    def _fetch_soup(self, url):
        """Download *url* and return its parsed document, or None on failure."""
        try:
            time.sleep(random.uniform(*self.delay_range))  # politeness delay
            response = requests.get(
                url,
                headers=self.headers,
                proxies=self.proxies,
                timeout=10,
                verify=self.verify_ssl,
            )
            response.raise_for_status()
        except requests.RequestException as e:
            print(f'Error crawling {url}: {e}')
            return None
        return BeautifulSoup(response.text, 'html.parser')

    @staticmethod
    def _extract_items(soup, rules):
        """Apply every rule to *soup*; return one dict per matched element."""
        items = []
        for rule in rules or []:
            extract = rule.get('extract', {})
            for element in soup.select(rule['selector']):
                item = {}
                for key, extractor in extract.items():
                    if extractor == 'text':
                        item[key] = element.get_text(strip=True)
                    elif extractor.startswith('attr:'):
                        value = element.get(extractor.split(':', 1)[1], '')
                        if isinstance(value, (list, tuple)):
                            # Multi-valued attributes such as ``class`` come
                            # back as lists; flatten them so JSON and CSV
                            # exports hold plain strings.
                            value = ' '.join(value)
                        item[key] = value
                items.append(item)
        return items

    def _extract_links(self, soup, base_url):
        """Return the distinct http(s) links of a page, capped per page."""
        links = {}  # dict keeps first-seen order while de-duplicating
        for anchor in soup.find_all('a', href=True):
            parsed = urlparse(urljoin(base_url, anchor['href']))
            if parsed.scheme in ('http', 'https'):
                # Drop the fragment: page#a and page#b are the same document.
                links.setdefault(parsed._replace(fragment='').geturl(), None)
        return list(links)[:self.max_links_per_page]

    def save_to_json(self, data, filename):
        """Write *data* to *filename* as pretty-printed UTF-8 JSON."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    def save_to_csv(self, data, filename):
        """Write a list of dicts to *filename* as CSV; no file if data is empty.

        The header is the union of all keys in first-seen order, so rows with
        differing keys no longer raise ValueError; missing cells are empty.
        """
        if not data:
            return

        fieldnames = list(dict.fromkeys(key for row in data for key in row))
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(data)
# ---------------------------------------------------------------------------
# comprehensive-tools/system_tools.py -- file deletion and power management.
# Uses os, shutil, subprocess and time.
# ---------------------------------------------------------------------------


class SystemTools:
    """File-deletion helpers and shutdown/restart commands."""

    # Per-action command lines and the wording used in error messages.
    _POWER_ACTIONS = {
        'shutdown': {
            'nt': ['shutdown', '/s', '/t', '0'],
            'posix': ['shutdown', '-h', 'now'],
            'gerund': 'shutting down',
        },
        'restart': {
            'nt': ['shutdown', '/r', '/t', '0'],
            'posix': ['shutdown', '-r', 'now'],
            'gerund': 'restarting',
        },
    }

    def __init__(self):
        pass

    def delete_files(self, file_paths, force=False):
        """Delete every path in *file_paths* (files or whole directories).

        Args:
            file_paths: Iterable of paths; a single path string is accepted
                too and treated as a one-element list.
            force: When False, each deletion is confirmed interactively.

        Returns:
            ``{'deleted': [path, ...], 'failed': [(path, reason), ...]}``.
        """
        if isinstance(file_paths, (str, bytes, os.PathLike)):
            # A bare string would otherwise be iterated character by character.
            file_paths = [file_paths]

        deleted = []
        failed = []

        for file_path in file_paths:
            # lexists() also reports broken symlinks, which exists() hides.
            if not os.path.lexists(file_path):
                failed.append((file_path, 'File not found'))
                continue

            if not force:
                confirm = input(
                    f'Are you sure you want to delete {file_path}? (y/n): ')
                if confirm.strip().lower() != 'y':
                    failed.append((file_path, 'User cancelled'))
                    continue

            try:
                # rmtree() refuses symlinks, so a link to a directory is
                # unlinked like a file; its target is left untouched.
                if os.path.isdir(file_path) and not os.path.islink(file_path):
                    shutil.rmtree(file_path)
                else:
                    os.remove(file_path)
            except OSError as e:
                failed.append((file_path, str(e)))
                print(f'Failed to delete {file_path}: {e}')
            else:
                deleted.append(file_path)
                print(f'Deleted: {file_path}')

        return {'deleted': deleted, 'failed': failed}

    def batch_delete_text_files(self, directory, force=False,
                                extensions=('.txt',)):
        """Recursively delete files under *directory* by extension.

        Args:
            directory: Root directory to scan, including sub-directories.
            force: Passed through to ``delete_files``.
            extensions: File-name suffixes to delete, matched
                case-insensitively (so ``NOTES.TXT`` is included).

        Returns:
            The ``delete_files`` result dict; both lists are empty when
            *directory* is not an existing directory.
        """
        if not os.path.isdir(directory):
            print(f'Directory not found: {directory}')
            return {'deleted': [], 'failed': []}

        suffixes = tuple(ext.lower() for ext in extensions)
        text_files = [
            os.path.join(root, name)
            for root, _dirs, files in os.walk(directory)
            for name in files
            if name.lower().endswith(suffixes)
        ]

        print(f'Found {len(text_files)} text files to delete')
        return self.delete_files(text_files, force)

    def _power_action(self, action, force, timeout):
        """Confirm, wait *timeout* seconds, then run the command for *action*.

        Returns True when the command ran successfully, False when the user
        cancelled (at the prompt or with Ctrl-C during the countdown) or the
        command failed.
        """
        spec = self._POWER_ACTIONS[action]

        if not force:
            confirm = input(
                f'Are you sure you want to {action} the system? (y/n): ')
            if confirm.strip().lower() != 'y':
                print(f'{action.capitalize()} cancelled')
                return False

        if timeout > 0:
            print(f'System will {action} in {timeout} seconds...')
            try:
                time.sleep(timeout)
            except KeyboardInterrupt:
                # Give the user a way out during the countdown.
                print(f'{action.capitalize()} cancelled')
                return False

        command = spec['nt'] if os.name == 'nt' else spec['posix']
        try:
            subprocess.run(command, check=True)
        except (OSError, subprocess.SubprocessError) as e:
            print(f'Error {spec["gerund"]} system: {e}')
            return False
        return True

    def shutdown_system(self, force=False, timeout=0):
        """Shut the machine down; see ``_power_action`` for the return value.

        Args:
            force: Skip the interactive confirmation.
            timeout: Seconds to wait before issuing the command.
        """
        return self._power_action('shutdown', force, timeout)

    def restart_system(self, force=False, timeout=0):
        """Restart the machine; see ``_power_action`` for the return value.

        Args:
            force: Skip the interactive confirmation.
            timeout: Seconds to wait before issuing the command.
        """
        return self._power_action('restart', force, timeout)


# ---------------------------------------------------------------------------
# comprehensive-tools/test.py -- manual smoke tests.
# Uses WebSpider (spider.py) and SystemTools (system_tools.py).
# ---------------------------------------------------------------------------


# Smoke test for the crawler (needs network access).
def test_spider():
    """Crawl example.com with and without rules and export the results."""
    print('测试网络爬虫...')
    url = 'https://example.com'

    # Plain crawl without extraction rules.
    spider = WebSpider()
    spider.set_max_depth(1)
    data = spider.crawl(url)
    print(f'爬取到 {len(data)} 条数据')

    # Crawl with rules. A separate spider is used so that URLs visited by
    # the first crawl cannot short-circuit this one.
    rules = [{
        'selector': 'a',
        'extract': {
            'text': 'text',
            'href': 'attr:href'
        }
    }]
    rule_spider = WebSpider()
    rule_spider.set_max_depth(1)
    data_with_rules = rule_spider.crawl(url, rules)
    print(f'带规则爬取到 {len(data_with_rules)} 条数据')

    # Export helpers.
    if data_with_rules:
        rule_spider.save_to_json(data_with_rules, 'test_spider.json')
        rule_spider.save_to_csv(data_with_rules, 'test_spider.csv')
        print('数据已保存到 test_spider.json 和 test_spider.csv')


# Smoke test for the system tools (file deletion only).
def test_system_tools():
    """Create a throw-away file and check that delete_files removes it."""
    import os
    import tempfile

    print('\n测试系统管理工具...')
    system_tools = SystemTools()

    # Work in a temporary directory so nothing is left in the CWD.
    with tempfile.TemporaryDirectory() as workdir:
        path = os.path.join(workdir, 'test_file.txt')
        with open(path, 'w', encoding='utf-8') as f:
            f.write('测试文件')

        result = system_tools.delete_files([path], force=True)
        print(f'删除文件结果: {result}')

        assert result == {'deleted': [path], 'failed': []}
        assert not os.path.exists(path)


if __name__ == '__main__':
    test_spider()
    test_system_tools()
    print('\n测试完成!')