Add comprehensive tools collection with web spider and system management tools

This commit is contained in:
badhope 2026-03-04 16:09:01 +08:00
parent 294aef8fde
commit acb43dadc5
8 changed files with 446 additions and 0 deletions

View file

@ -0,0 +1,68 @@
# 综合工具集合
本项目提供了一个综合性的工具集合,包含两个主要功能模块:网络爬虫工具和系统管理工具。
## 功能模块
### 1. 网络爬虫工具
- 支持自定义URL输入和爬取规则配置
- 包含数据提取、存储和导出功能
- 实现基本的反爬机制和错误处理
- 支持JSON和CSV格式数据导出
### 2. 系统管理工具
- 批处理文本文件自动强制删除功能
- 文件强制删除的安全确认机制
- 系统强制关机功能及定时关机选项
- 系统重启功能及定时重启选项
## 安装依赖
```bash
pip install requests beautifulsoup4
```
## 使用方法
1. 运行主程序
```bash
python main.py
```
2. 选择功能模块
### 网络爬虫工具使用示例
1. 选择"1. 网络爬虫工具"
2. 选择"1. 开始爬取"
3. 输入起始URL,例如 `https://example.com`
4. 根据提示设置爬取规则(可选)
5. 爬取完成后选择是否保存数据及保存格式
### 系统管理工具使用示例
1. 选择"2. 系统管理工具"
2. 选择相应的功能:
- "1. 批量删除文本文件":删除指定目录下的所有文本文件
- "2. 删除指定文件":删除用户指定的文件
- "3. 关闭系统":关闭计算机
- "4. 重启系统":重启计算机
## 注意事项
- 使用网络爬虫工具时请遵循目标网站的robots.txt规则,请勿用于非法爬取
- 系统管理工具的关机和重启功能需要管理员权限
- 批量删除文件时请谨慎操作,建议先备份重要数据
## 代码结构
- `main.py`:主程序,提供用户界面
- `spider.py`:网络爬虫模块
- `system_tools.py`:系统管理工具模块
- `test.py`:测试文件
- `README.md`:使用说明
## 贡献
欢迎提交问题和改进建议。

Binary file not shown.

Binary file not shown.

152
comprehensive-tools/main.py Normal file
View file

@ -0,0 +1,152 @@
from spider import WebSpider
from system_tools import SystemTools
def print_menu():
    """Print the top-level menu, framed by 60-character '=' separator lines."""
    separator = '=' * 60
    menu_lines = (
        separator,
        '综合工具集合',
        separator,
        '1. 网络爬虫工具',
        '2. 系统管理工具',
        '0. 退出',
        separator,
    )
    for line in menu_lines:
        print(line)
def _prompt_extract_fields():
    """Interactively collect a ``{field name: extractor}`` mapping for one rule."""
    fields = {}
    while True:
        key = input('请输入字段名: ')
        fields[key] = input('请输入提取方式 (text 或 attr:属性名): ')
        if input('是否添加更多提取规则? (y/n): ').lower() != 'y':
            return fields


def _prompt_crawl_rules():
    """Interactively build the crawl-rule list; return None if the user declines."""
    if input('是否设置爬取规则? (y/n): ').lower() != 'y':
        return None
    rules = []
    while True:
        selector = input('请输入CSS选择器: ')
        extract = {}
        if input('是否设置提取规则? (y/n): ').lower() == 'y':
            extract = _prompt_extract_fields()
        rules.append({'selector': selector, 'extract': extract})
        if input('是否添加更多爬取规则? (y/n): ').lower() != 'y':
            return rules


def _save_crawl_results(spider, data):
    """Ask whether and how to save *data*, then write it through *spider*."""
    if input('是否保存数据? (y/n): ').lower() != 'y':
        return
    save_format = input('保存格式 (json/csv): ').strip().lower()
    writers = {'json': spider.save_to_json, 'csv': spider.save_to_csv}
    writer = writers.get(save_format)
    if writer is None:
        # Previously an unknown format was silently ignored after asking for a
        # file name; tell the user nothing was written instead.
        print('不支持的保存格式,数据未保存')
        return
    filename = input('请输入文件名: ')
    path = f'{filename}.{save_format}'
    try:
        writer(data, path)
    except OSError as exc:
        # An unwritable path should not take the whole menu down.
        print(f'保存失败: {exc}')
    else:
        print(f'数据已保存到 {path}')


def spider_menu():
    """Run the interactive crawler sub-menu until the user returns to the main menu.

    A single WebSpider instance lives for the duration of the sub-menu, so the
    proxy and depth settings chosen here apply to later crawls in the same
    session.
    """
    spider = WebSpider()
    while True:
        print('\n网络爬虫工具')
        print('1. 开始爬取')
        print('2. 设置代理')
        print('3. 设置爬取深度')
        print('0. 返回主菜单')
        choice = input('请选择: ')
        if choice == '1':
            url = input('请输入起始URL: ')
            rules = _prompt_crawl_rules()
            data = spider.crawl(url, rules)
            if data:
                _save_crawl_results(spider, data)
            else:
                print('未爬取到数据')
        elif choice == '2':
            proxy = input('请输入代理地址 (格式: http://ip:port): ')
            spider.set_proxies({'http': proxy, 'https': proxy})
            print('代理设置成功')
        elif choice == '3':
            try:
                depth = int(input('请输入爬取深度: '))
            except ValueError:
                depth = -1
            if depth < 0:
                # Non-numeric input used to raise ValueError and kill the program;
                # a negative depth would make every crawl return nothing.
                print('无效的深度,请输入非负整数')
            else:
                spider.set_max_depth(depth)
                print('爬取深度设置成功')
        elif choice == '0':
            break
        else:
            print('无效选择,请重新输入')
def _print_delete_report(result):
    """Print the deleted/failed summary returned by the SystemTools delete methods."""
    print(f'成功删除: {len(result["deleted"])} 个文件')
    print(f'失败: {len(result["failed"])} 个文件')
    if result["failed"]:
        print('失败列表:')
        for file, error in result["failed"]:
            print(f' - {file}: {error}')


def _read_delay_seconds():
    """Prompt for a delay in seconds; return a non-negative int, or None if invalid."""
    raw = input('请输入延迟时间 (秒0表示立即): ')
    try:
        delay = int(raw)
    except ValueError:
        delay = -1
    if delay < 0:
        # A typo must not crash the menu, and a negative value must not be
        # treated as "shut down immediately".
        print('无效的延迟时间,请输入非负整数')
        return None
    return delay


def system_tools_menu():
    """Run the interactive system-management sub-menu until the user returns.

    Offers batch deletion of ``.txt`` files, deletion of named paths, and
    shutdown/restart with an optional delay. Answering "y" to the force
    question skips the per-action confirmation prompt in SystemTools.
    """
    system_tools = SystemTools()
    while True:
        print('\n系统管理工具')
        print('1. 批量删除文本文件')
        print('2. 删除指定文件')
        print('3. 关闭系统')
        print('4. 重启系统')
        print('0. 返回主菜单')
        choice = input('请选择: ')
        if choice == '1':
            directory = input('请输入目录路径: ')
            force = input('是否强制删除 (y/n): ').lower() == 'y'
            _print_delete_report(
                system_tools.batch_delete_text_files(directory, force)
            )
        elif choice == '2':
            raw_paths = input('请输入文件路径,多个文件用逗号分隔: ')
            # Drop empty entries produced by blank input or trailing commas.
            files = [part.strip() for part in raw_paths.split(',') if part.strip()]
            if not files:
                print('未输入任何文件路径')
                continue
            force = input('是否强制删除 (y/n): ').lower() == 'y'
            _print_delete_report(system_tools.delete_files(files, force))
        elif choice == '3':
            timeout = _read_delay_seconds()
            if timeout is None:
                continue
            force = input('是否强制关闭 (y/n): ').lower() == 'y'
            system_tools.shutdown_system(force, timeout)
        elif choice == '4':
            timeout = _read_delay_seconds()
            if timeout is None:
                continue
            force = input('是否强制重启 (y/n): ').lower() == 'y'
            system_tools.restart_system(force, timeout)
        elif choice == '0':
            break
        else:
            print('无效选择,请重新输入')
def main():
    """Show the top-level menu in a loop and dispatch to the chosen sub-menu."""
    submenus = {'1': spider_menu, '2': system_tools_menu}
    while True:
        print_menu()
        selection = input('请选择: ')
        if selection == '0':
            print('感谢使用,再见!')
            return
        handler = submenus.get(selection)
        if handler is None:
            print('无效选择,请重新输入')
            continue
        handler()


if __name__ == '__main__':
    main()

View file

@ -0,0 +1,81 @@
import requests
from bs4 import BeautifulSoup
import json
import csv
import time
import random
from urllib.parse import urljoin, urlparse
class WebSpider:
    """Small recursive crawler that extracts data with CSS-selector rules.

    A rule is a dict with a ``'selector'`` (CSS selector) and an optional
    ``'extract'`` mapping of ``field name -> extractor``, where an extractor is
    either ``'text'`` or ``'attr:<attribute name>'``.
    """

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.proxies = {}
        self.visited_urls = set()
        self.max_depth = 2
        # Maximum number of links followed from each page.
        self.max_links_per_page = 10
        # NOTE(review): TLS certificate verification is disabled, matching the
        # original behavior. This exposes requests to man-in-the-middle attacks;
        # set to True unless self-signed targets are really needed.
        self.verify_ssl = False

    def set_proxies(self, proxies):
        """Set the proxies mapping passed to ``requests.get``."""
        self.proxies = proxies

    def set_max_depth(self, depth):
        """Set the deepest link level to crawl (the start URL is depth 0)."""
        self.max_depth = depth

    def crawl(self, start_url, rules=None, depth=0):
        """Crawl *start_url* and the pages it links to; return extracted items.

        Args:
            start_url: URL of the page to fetch.
            rules: Optional list of rule dicts (see the class docstring). With
                no rules, pages are still visited but no items are produced.
            depth: Current recursion depth; callers should leave this at 0.

        Returns:
            A list of dicts, one per element matched by a rule, gathered from
            this page and from followed links up to ``max_depth``. Pages that
            fail to download contribute nothing.
        """
        if depth == 0:
            # A top-level call starts a new crawl. Without this reset, crawling
            # the same URL twice on one instance returned [] the second time.
            self.visited_urls = set()
        if depth > self.max_depth or start_url in self.visited_urls:
            return []
        self.visited_urls.add(start_url)
        print(f'Crawling: {start_url}')
        try:
            # Random delay between requests to reduce the chance of being blocked.
            time.sleep(random.uniform(1, 3))
            response = requests.get(
                start_url,
                headers=self.headers,
                proxies=self.proxies,
                timeout=10,
                verify=self.verify_ssl,
            )
            response.raise_for_status()
        except requests.RequestException as e:
            print(f'Error crawling {start_url}: {e}')
            return []
        soup = BeautifulSoup(response.text, 'html.parser')
        data = self._extract_items(soup, rules)
        # Children of a page at max_depth would be rejected immediately, so
        # skip collecting links for them.
        if depth < self.max_depth:
            links = self._collect_links(soup, start_url)
            for link in links[:self.max_links_per_page]:
                data.extend(self.crawl(link, rules, depth + 1))
        return data

    @staticmethod
    def _extract_items(soup, rules):
        """Apply every rule to *soup* and return one dict per matched element."""
        items = []
        for rule in rules or []:
            extract = rule.get('extract', {})
            for element in soup.select(rule['selector']):
                item = {}
                for key, extractor in extract.items():
                    if extractor == 'text':
                        item[key] = element.get_text(strip=True)
                    elif extractor.startswith('attr:'):
                        attr = extractor.split(':', 1)[1]
                        item[key] = element.get(attr, '')
                items.append(item)
        return items

    @staticmethod
    def _collect_links(soup, base_url):
        """Return absolute http(s) URLs for every ``<a href>`` in *soup*, in order."""
        links = []
        for anchor in soup.find_all('a', href=True):
            try:
                absolute_url = urljoin(base_url, anchor['href'])
                scheme = urlparse(absolute_url).scheme
            except ValueError:
                # Malformed hrefs (e.g. an unterminated IPv6 literal) make
                # urllib raise; skip them instead of aborting the crawl.
                continue
            if scheme in ('http', 'https'):
                links.append(absolute_url)
        return links

    def save_to_json(self, data, filename):
        """Write *data* to *filename* as indented UTF-8 JSON."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    def save_to_csv(self, data, filename):
        """Write a list of dicts to *filename* as UTF-8 CSV.

        The header is the union of all row keys in first-seen order; fields a
        row lacks are written as empty strings. Nothing is written when *data*
        is empty.
        """
        if not data:
            return
        # Rows produced by different rules can have different keys; taking only
        # the first row's keys made DictWriter raise ValueError on the others.
        fieldnames = list(dict.fromkeys(key for row in data for key in row))
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(data)

View file

@ -0,0 +1,97 @@
import os
import shutil
import subprocess
import time
import ctypes
class SystemTools:
    """File-deletion and shutdown/restart helpers used by the interactive menu."""

    def delete_files(self, file_paths, force=False):
        """Delete each path in *file_paths*; directories are removed recursively.

        Args:
            file_paths: Iterable of file or directory paths.
            force: When False, ask for confirmation before each deletion.

        Returns:
            ``{'deleted': [path, ...], 'failed': [(path, reason), ...]}``.
        """
        deleted = []
        failed = []
        for file_path in file_paths:
            # lexists so that a dangling symlink is still found and removable.
            if not os.path.lexists(file_path):
                failed.append((file_path, 'File not found'))
                continue
            if not force:
                confirm = input(f'Are you sure you want to delete {file_path}? (y/n): ')
                if confirm.lower() != 'y':
                    failed.append((file_path, 'User cancelled'))
                    continue
            try:
                # rmtree refuses symlinks; remove a link-to-directory as a link
                # rather than touching the directory it points to.
                if os.path.isdir(file_path) and not os.path.islink(file_path):
                    shutil.rmtree(file_path)
                else:
                    os.remove(file_path)
            except OSError as e:
                failed.append((file_path, str(e)))
                print(f'Failed to delete {file_path}: {e}')
            else:
                deleted.append(file_path)
                print(f'Deleted: {file_path}')
        return {'deleted': deleted, 'failed': failed}

    def batch_delete_text_files(self, directory, force=False):
        """Delete every ``.txt`` file under *directory*, including subdirectories.

        Returns the same result dict as ``delete_files``; both lists are empty
        when *directory* is not an existing directory.
        """
        if not os.path.isdir(directory):
            print(f'Directory not found: {directory}')
            return {'deleted': [], 'failed': []}
        text_files = [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(directory)
            for name in names
            if name.endswith('.txt')
        ]
        print(f'Found {len(text_files)} text files to delete')
        return self.delete_files(text_files, force)

    def _power_action(self, verb, gerund, windows_flag, unix_flag, force, timeout):
        """Confirm, wait *timeout* seconds, then run the OS ``shutdown`` command.

        Shared implementation of shutdown_system and restart_system. Returns
        True if the command ran successfully, False if the user cancelled or
        the command failed.
        """
        if not force:
            confirm = input(f'Are you sure you want to {verb} the system? (y/n): ')
            if confirm.lower() != 'y':
                print(f'{verb.capitalize()} cancelled')
                return False
        if timeout > 0:
            print(f'System will {verb} in {timeout} seconds...')
            time.sleep(timeout)
        if os.name == 'nt':  # Windows
            command = ['shutdown', windows_flag, '/t', '0']
        else:  # Unix-like
            command = ['shutdown', unix_flag, 'now']
        try:
            subprocess.run(command, check=True)
        except (OSError, subprocess.CalledProcessError) as e:
            # OSError: command missing or not executable;
            # CalledProcessError: non-zero exit status.
            print(f'Error {gerund} system: {e}')
            return False
        return True

    def shutdown_system(self, force=False, timeout=0):
        """Shut the machine down.

        Args:
            force: Skip the confirmation prompt when True.
            timeout: Seconds to wait before issuing the command (0 = now).

        Returns:
            True on success, False if cancelled or the command failed.
        """
        return self._power_action('shutdown', 'shutting down', '/s', '-h', force, timeout)

    def restart_system(self, force=False, timeout=0):
        """Restart the machine.

        Args:
            force: Skip the confirmation prompt when True.
            timeout: Seconds to wait before issuing the command (0 = now).

        Returns:
            True on success, False if cancelled or the command failed.
        """
        return self._power_action('restart', 'restarting', '/r', '-r', force, timeout)

View file

@ -0,0 +1,48 @@
from spider import WebSpider
from system_tools import SystemTools
# 测试网络爬虫
def test_spider():
    """Smoke-test WebSpider against https://example.com (requires network access).

    Runs one crawl without rules and one with a link-extraction rule, then
    saves the rule-based results to test_spider.json and test_spider.csv in
    the current directory.
    """
    print('测试网络爬虫...')
    url = 'https://example.com'

    # Plain crawl without extraction rules.
    spider = WebSpider()
    spider.set_max_depth(1)
    data = spider.crawl(url)
    print(f'爬取到 {len(data)} 条数据')

    # Crawl with an extraction rule. Use a fresh spider so URLs remembered from
    # the first crawl cannot cause this one to be skipped as already visited.
    rules = [{
        'selector': 'a',
        'extract': {
            'text': 'text',
            'href': 'attr:href',
        },
    }]
    rule_spider = WebSpider()
    rule_spider.set_max_depth(1)
    data_with_rules = rule_spider.crawl(url, rules)
    print(f'带规则爬取到 {len(data_with_rules)} 条数据')

    # Exercise both export formats when something was extracted.
    if data_with_rules:
        rule_spider.save_to_json(data_with_rules, 'test_spider.json')
        rule_spider.save_to_csv(data_with_rules, 'test_spider.csv')
        print('数据已保存到 test_spider.json 和 test_spider.csv')
# 测试系统管理工具
def test_system_tools():
    """Create a scratch file, delete it via SystemTools, and check the result."""
    print('\n测试系统管理工具...')
    system_tools = SystemTools()
    test_path = 'test_file.txt'
    # Explicit encoding: the content is non-ASCII, and the platform default
    # encoding may be unable to encode it.
    with open(test_path, 'w', encoding='utf-8') as f:
        f.write('测试文件')
    # force=True skips the interactive confirmation prompt.
    result = system_tools.delete_files([test_path], force=True)
    print(f'删除文件结果: {result}')
    assert result == {'deleted': [test_path], 'failed': []}, result
if __name__ == '__main__':
    # Run the crawler smoke test first, then the file-deletion smoke test.
    for _smoke_test in (test_spider, test_system_tools):
        _smoke_test()
    print('\n测试完成!')