mirror of
https://github.com/codecrafters-io/build-your-own-x
synced 2026-07-03 01:09:25 +00:00
Merge acb43dadc5 into 294aef8fde
This commit is contained in:
commit
36813c1608
8 changed files with 446 additions and 0 deletions
68
comprehensive-tools/README.md
Normal file
68
comprehensive-tools/README.md
Normal file
|
|
@ -0,0 +1,68 @@
|
||||||
|
# 综合工具集合
|
||||||
|
|
||||||
|
本项目提供了一个综合性的工具集合,包含两个主要功能模块:网络爬虫工具和系统管理工具。
|
||||||
|
|
||||||
|
## 功能模块
|
||||||
|
|
||||||
|
### 1. 网络爬虫工具
|
||||||
|
- 支持自定义URL输入和爬取规则配置
|
||||||
|
- 包含数据提取、存储和导出功能
|
||||||
|
- 实现基本的反爬应对措施(自定义 User-Agent、随机请求间隔、可选代理)和错误处理
|
||||||
|
- 支持JSON和CSV格式数据导出
|
||||||
|
|
||||||
|
### 2. 系统管理工具
|
||||||
|
- 批量删除指定目录下的文本文件(.txt),可选择强制删除(跳过逐个确认)
|
||||||
|
- 文件强制删除的安全确认机制
|
||||||
|
- 系统强制关机功能及定时关机选项
|
||||||
|
- 系统重启功能及定时重启选项
|
||||||
|
|
||||||
|
## 安装依赖
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install requests beautifulsoup4
|
||||||
|
```
|
||||||
|
|
||||||
|
## 使用方法
|
||||||
|
|
||||||
|
1. 运行主程序
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python main.py
|
||||||
|
```
|
||||||
|
|
||||||
|
2. 选择功能模块
|
||||||
|
|
||||||
|
### 网络爬虫工具使用示例
|
||||||
|
|
||||||
|
1. 选择"1. 网络爬虫工具"
|
||||||
|
2. 选择"1. 开始爬取"
|
||||||
|
3. 输入起始URL,例如:`https://example.com`
|
||||||
|
4. 根据提示设置爬取规则(可选)
|
||||||
|
5. 爬取完成后选择是否保存数据及保存格式
|
||||||
|
|
||||||
|
### 系统管理工具使用示例
|
||||||
|
|
||||||
|
1. 选择"2. 系统管理工具"
|
||||||
|
2. 选择相应的功能:
|
||||||
|
- "1. 批量删除文本文件":删除指定目录下的所有文本文件
|
||||||
|
- "2. 删除指定文件":删除用户指定的文件
|
||||||
|
- "3. 关闭系统":关闭计算机
|
||||||
|
- "4. 重启系统":重启计算机
|
||||||
|
|
||||||
|
## 注意事项
|
||||||
|
|
||||||
|
- 网络爬虫工具当前不会自动检查 robots.txt,请在爬取前自行确认目标站点的 robots.txt 规则,请勿用于非法爬取
|
||||||
|
- 系统管理工具的关机和重启功能需要管理员权限
|
||||||
|
- 批量删除文件时请谨慎操作,建议先备份重要数据
|
||||||
|
|
||||||
|
## 代码结构
|
||||||
|
|
||||||
|
- `main.py`:主程序,提供用户界面
|
||||||
|
- `spider.py`:网络爬虫模块
|
||||||
|
- `system_tools.py`:系统管理工具模块
|
||||||
|
- `test.py`:测试文件
|
||||||
|
- `README.md`:使用说明
|
||||||
|
|
||||||
|
## 贡献
|
||||||
|
|
||||||
|
欢迎提交问题和改进建议。
|
||||||
BIN
comprehensive-tools/__pycache__/main.cpython-314.pyc
Normal file
BIN
comprehensive-tools/__pycache__/main.cpython-314.pyc
Normal file
Binary file not shown.
BIN
comprehensive-tools/__pycache__/spider.cpython-314.pyc
Normal file
BIN
comprehensive-tools/__pycache__/spider.cpython-314.pyc
Normal file
Binary file not shown.
BIN
comprehensive-tools/__pycache__/system_tools.cpython-314.pyc
Normal file
BIN
comprehensive-tools/__pycache__/system_tools.cpython-314.pyc
Normal file
Binary file not shown.
152
comprehensive-tools/main.py
Normal file
152
comprehensive-tools/main.py
Normal file
|
|
@ -0,0 +1,152 @@
|
||||||
|
from spider import WebSpider
|
||||||
|
from system_tools import SystemTools
|
||||||
|
|
||||||
|
def print_menu():
    """Print the top-level menu: a banner, the title and the numbered options."""
    divider = '=' * 60
    menu_lines = (
        divider,
        '综合工具集合',
        divider,
        '1. 网络爬虫工具',
        '2. 系统管理工具',
        '0. 退出',
        divider,
    )
    for line in menu_lines:
        print(line)
|
||||||
|
|
||||||
|
def _prompt_extract_rules():
    """Interactively collect the field-name -> extractor mapping for one selector."""
    extract = {}
    while True:
        key = input('请输入字段名: ')
        extractor = input('请输入提取方式 (text 或 attr:属性名): ')
        extract[key] = extractor
        if input('是否添加更多提取规则? (y/n): ').lower() != 'y':
            return extract


def _prompt_crawl_rules():
    """Interactively collect crawl rules; return None when the user declines."""
    if input('是否设置爬取规则? (y/n): ').lower() != 'y':
        return None

    rules = []
    while True:
        selector = input('请输入CSS选择器: ')
        extract = {}
        if input('是否设置提取规则? (y/n): ').lower() == 'y':
            extract = _prompt_extract_rules()
        rules.append({'selector': selector, 'extract': extract})
        if input('是否添加更多爬取规则? (y/n): ').lower() != 'y':
            return rules


def _prompt_save(spider, data):
    """Ask whether and how to save crawled data, then write it via the spider."""
    if input('是否保存数据? (y/n): ').lower() != 'y':
        return

    savers = {'json': spider.save_to_json, 'csv': spider.save_to_csv}
    fmt = input('保存格式 (json/csv): ').strip().lower()
    saver = savers.get(fmt)
    if saver is None:
        # Previously an unknown format was dropped without any feedback.
        print('不支持的保存格式,数据未保存')
        return

    filename = input('请输入文件名: ').strip()
    if not filename:
        print('文件名不能为空,数据未保存')
        return

    target = f'{filename}.{fmt}'
    try:
        saver(data, target)
    except (OSError, ValueError) as e:
        # A bad path or inconsistent rows must not kill the interactive menu.
        print(f'保存失败: {e}')
        return
    print(f'数据已保存到 {target}')


def spider_menu():
    """Run the interactive web-spider submenu until the user picks '0'.

    One WebSpider instance lives for the whole submenu session, so proxy and
    depth settings chosen via options 2 and 3 apply to later crawls.
    Invalid input (empty URL/proxy, non-integer or negative depth) is reported
    and the menu is shown again instead of raising.
    """
    spider = WebSpider()

    while True:
        print('\n网络爬虫工具')
        print('1. 开始爬取')
        print('2. 设置代理')
        print('3. 设置爬取深度')
        print('0. 返回主菜单')

        choice = input('请选择: ')

        if choice == '1':
            url = input('请输入起始URL: ').strip()
            if not url:
                print('URL不能为空')
                continue
            rules = _prompt_crawl_rules()
            data = spider.crawl(url, rules)
            if data:
                _prompt_save(spider, data)

        elif choice == '2':
            proxy = input('请输入代理地址 (格式: http://ip:port): ').strip()
            if not proxy:
                print('代理地址不能为空')
                continue
            spider.set_proxies({'http': proxy, 'https': proxy})
            print('代理设置成功')

        elif choice == '3':
            try:
                depth = int(input('请输入爬取深度: '))
            except ValueError:
                print('无效的爬取深度,请输入整数')
                continue
            if depth < 0:
                # A negative depth would make every crawl return nothing.
                print('爬取深度不能为负数')
                continue
            spider.set_max_depth(depth)
            print('爬取深度设置成功')

        elif choice == '0':
            break

        else:
            print('无效选择,请重新输入')
|
||||||
|
|
||||||
|
def _print_delete_result(result):
    """Print the deleted/failed counts and the reason for each failure."""
    print(f'成功删除: {len(result["deleted"])} 个文件')
    print(f'失败: {len(result["failed"])} 个文件')
    if result["failed"]:
        print('失败列表:')
        for file, error in result["failed"]:
            print(f' - {file}: {error}')


def _prompt_delay():
    """Read a non-negative delay in seconds; return None on invalid input."""
    raw = input('请输入延迟时间 (秒,0表示立即): ')
    try:
        delay = int(raw)
    except ValueError:
        print('无效的延迟时间,请输入整数秒数')
        return None
    if delay < 0:
        print('延迟时间不能为负数')
        return None
    return delay


def system_tools_menu():
    """Run the interactive system-tools submenu until the user picks '0'.

    Options 1 and 2 delete files and print a summary of the result; options 3
    and 4 shut down or restart the machine after an optional delay. Invalid
    numeric input is reported and the menu is shown again instead of raising.
    """
    system_tools = SystemTools()

    while True:
        print('\n系统管理工具')
        print('1. 批量删除文本文件')
        print('2. 删除指定文件')
        print('3. 关闭系统')
        print('4. 重启系统')
        print('0. 返回主菜单')

        choice = input('请选择: ')

        if choice == '1':
            directory = input('请输入目录路径: ')
            force = input('是否强制删除 (y/n): ').lower() == 'y'
            result = system_tools.batch_delete_text_files(directory, force)
            _print_delete_result(result)

        elif choice == '2':
            raw = input('请输入文件路径,多个文件用逗号分隔: ')
            # Drop empty entries (e.g. from a trailing comma) so they are not
            # reported as missing files.
            files = [part.strip() for part in raw.split(',') if part.strip()]
            if not files:
                print('未输入任何文件路径')
                continue
            force = input('是否强制删除 (y/n): ').lower() == 'y'
            result = system_tools.delete_files(files, force)
            _print_delete_result(result)

        elif choice == '3':
            timeout = _prompt_delay()
            if timeout is None:
                continue
            force = input('是否强制关闭 (y/n): ').lower() == 'y'
            system_tools.shutdown_system(force, timeout)

        elif choice == '4':
            timeout = _prompt_delay()
            if timeout is None:
                continue
            force = input('是否强制重启 (y/n): ').lower() == 'y'
            system_tools.restart_system(force, timeout)

        elif choice == '0':
            break

        else:
            print('无效选择,请重新输入')
|
||||||
|
|
||||||
|
def main():
    """Show the top-level menu in a loop and dispatch to the chosen submenu.

    Returns when the user selects '0'; any unrecognised choice prints a hint
    and the menu is displayed again.
    """
    submenus = {
        '1': spider_menu,
        '2': system_tools_menu,
    }

    while True:
        print_menu()
        choice = input('请选择: ')

        if choice == '0':
            print('感谢使用,再见!')
            return

        handler = submenus.get(choice)
        if handler is None:
            print('无效选择,请重新输入')
        else:
            handler()
|
||||||
|
|
||||||
|
# Start the interactive menu only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|
||||||
81
comprehensive-tools/spider.py
Normal file
81
comprehensive-tools/spider.py
Normal file
|
|
@ -0,0 +1,81 @@
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import json
|
||||||
|
import csv
|
||||||
|
import time
|
||||||
|
import random
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
|
class WebSpider:
    """Small recursive web crawler with CSS-selector based data extraction.

    A crawl starts at one URL, optionally extracts items from each page
    according to ``rules``, and follows up to ``max_links_per_page`` links
    per page until ``max_depth`` is exceeded.

    A rule is a dict ``{'selector': <css>, 'extract': {<field>: <extractor>}}``
    where an extractor is ``'text'`` or ``'attr:<attribute name>'``.
    """

    # Seconds to wait for a single HTTP response.
    request_timeout = 10
    # Upper bound on the number of links followed from each page.
    max_links_per_page = 10

    def __init__(self, verify_ssl=True):
        """Create a spider with default headers, no proxies and depth 2.

        Args:
            verify_ssl: Whether TLS certificates are verified. Pass False
                only for hosts you trust that use self-signed certificates.
        """
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.proxies = {}
        self.visited_urls = set()
        self.max_depth = 2
        self.verify_ssl = verify_ssl

    def set_proxies(self, proxies):
        """Set the proxies mapping passed to every request."""
        self.proxies = proxies

    def set_max_depth(self, depth):
        """Set the maximum link depth (0 crawls only the start page)."""
        self.max_depth = depth

    def crawl(self, start_url, rules=None, depth=0):
        """Crawl ``start_url`` recursively and return the extracted items.

        Args:
            start_url: URL of the page to fetch.
            rules: Optional list of extraction rules (see class docstring).
            depth: Current recursion depth; callers normally leave it at 0.

        Returns:
            A list of dicts, one per matched element, gathered from this page
            and every page reached from it. Pages that fail to load
            contribute nothing.
        """
        if depth == 0:
            # Each top-level crawl starts fresh; otherwise crawling the same
            # URL twice with one spider would return [] the second time.
            self.visited_urls = set()

        if depth > self.max_depth or start_url in self.visited_urls:
            return []

        self.visited_urls.add(start_url)
        print(f'Crawling: {start_url}')

        # Random pause between requests to avoid hammering the server.
        time.sleep(random.uniform(1, 3))
        try:
            response = requests.get(
                start_url,
                headers=self.headers,
                proxies=self.proxies,
                timeout=self.request_timeout,
                verify=self.verify_ssl,
            )
            response.raise_for_status()
        except Exception as e:
            # Best-effort crawl: one unreachable or malformed URL must not
            # abort the whole run and lose the data collected so far.
            print(f'Error crawling {start_url}: {e}')
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        data = self._extract_items(soup, rules)

        # Resolve links against the final URL so relative hrefs stay correct
        # after a redirect.
        links = self._collect_links(soup, response.url or start_url)
        for link in links[:self.max_links_per_page]:
            data.extend(self.crawl(link, rules, depth + 1))

        return data

    @staticmethod
    def _extract_items(soup, rules):
        """Apply every rule to ``soup`` and return one dict per matched element."""
        items = []
        for rule in rules or ():
            selector = rule['selector']
            try:
                elements = soup.select(selector)
            except Exception as e:
                # User-typed selectors may be syntactically invalid; skip the
                # rule instead of crashing the crawl.
                print(f'Invalid selector {selector!r}: {e}')
                continue

            extract = rule.get('extract', {})
            for element in elements:
                item = {}
                for key, extractor in extract.items():
                    if extractor == 'text':
                        item[key] = element.get_text(strip=True)
                    elif extractor.startswith('attr:'):
                        attr = extractor.split(':', 1)[1]
                        item[key] = element.get(attr, '')
                items.append(item)
        return items

    @staticmethod
    def _collect_links(soup, base_url):
        """Return the unique absolute http(s) links of a page, in page order."""
        links = []
        seen = set()
        for a in soup.find_all('a', href=True):
            absolute_url = urljoin(base_url, a['href'])
            if urlparse(absolute_url).scheme not in ('http', 'https'):
                continue
            if absolute_url in seen:
                # Duplicates would waste the per-page link budget.
                continue
            seen.add(absolute_url)
            links.append(absolute_url)
        return links

    def save_to_json(self, data, filename):
        """Write ``data`` to ``filename`` as indented UTF-8 JSON."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    def save_to_csv(self, data, filename):
        """Write ``data`` (a list of dicts) to ``filename`` as UTF-8 CSV.

        Nothing is written when ``data`` is empty. The header is the union of
        all keys in first-seen order, so rows produced by different rules
        (with different fields) can share one file; missing fields are left
        empty.
        """
        if not data:
            return

        fieldnames = list(dict.fromkeys(key for row in data for key in row))
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames, restval='')
            writer.writeheader()
            writer.writerows(data)
|
||||||
97
comprehensive-tools/system_tools.py
Normal file
97
comprehensive-tools/system_tools.py
Normal file
|
|
@ -0,0 +1,97 @@
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
import ctypes
|
||||||
|
|
||||||
|
class SystemTools:
    """File deletion and shutdown/restart helpers with optional confirmation.

    In every method ``force=True`` means "do not ask the user for
    confirmation"; it does not change how the operation itself is performed.
    """

    def __init__(self):
        pass

    def delete_files(self, file_paths, force=False):
        """Delete each path in ``file_paths`` (files, symlinks or directories).

        Args:
            file_paths: Iterable of paths to delete. Real directories are
                removed recursively; symlinks are unlinked, never followed.
            force: When False, ask for confirmation before each deletion.

        Returns:
            ``{'deleted': [path, ...], 'failed': [(path, reason), ...]}``.
        """
        deleted = []
        failed = []

        for file_path in file_paths:
            # lexists() also reports broken symlinks, which can still be removed.
            if not os.path.lexists(file_path):
                failed.append((file_path, 'File not found'))
                continue

            if not force:
                confirm = input(f'Are you sure you want to delete {file_path}? (y/n): ')
                if confirm.lower() != 'y':
                    failed.append((file_path, 'User cancelled'))
                    continue

            try:
                # rmtree() refuses symlinks, so only use it on real directories.
                if os.path.isdir(file_path) and not os.path.islink(file_path):
                    shutil.rmtree(file_path)
                else:
                    os.remove(file_path)
            except OSError as e:
                failed.append((file_path, str(e)))
                print(f'Failed to delete {file_path}: {e}')
            else:
                deleted.append(file_path)
                print(f'Deleted: {file_path}')

        return {'deleted': deleted, 'failed': failed}

    def batch_delete_text_files(self, directory, force=False):
        """Delete every ``*.txt`` file under ``directory``, recursively.

        Args:
            directory: Root directory to search.
            force: Passed to ``delete_files``; False asks before each file.

        Returns:
            The ``delete_files`` result dict; both lists are empty when
            ``directory`` is not an existing directory.
        """
        if not os.path.isdir(directory):
            print(f'Directory not found: {directory}')
            return {'deleted': [], 'failed': []}

        text_files = [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(directory)
            for name in names
            if name.endswith('.txt')
        ]

        print(f'Found {len(text_files)} text files to delete')
        return self.delete_files(text_files, force)

    def _power_action(self, verb, gerund, windows_flag, unix_flag, force, timeout):
        """Confirm, wait ``timeout`` seconds, then run the OS ``shutdown`` command."""
        if not force:
            confirm = input(f'Are you sure you want to {verb} the system? (y/n): ')
            if confirm.lower() != 'y':
                print(f'{verb.capitalize()} cancelled')
                return False

        if timeout > 0:
            print(f'System will {verb} in {timeout} seconds...')
            time.sleep(timeout)

        if os.name == 'nt':  # Windows
            command = ['shutdown', windows_flag, '/t', '0']
        else:  # Unix-like
            command = ['shutdown', unix_flag, 'now']

        try:
            subprocess.run(command, check=True)
        except (OSError, subprocess.SubprocessError) as e:
            # Missing binary, insufficient privileges or a non-zero exit code.
            print(f'Error {gerund} system: {e}')
            return False
        return True

    def shutdown_system(self, force=False, timeout=0):
        """Shut the machine down.

        Args:
            force: Skip the confirmation prompt when True.
            timeout: Seconds to wait before issuing the command.

        Returns:
            True if the shutdown command succeeded, False if the user
            cancelled or the command failed.
        """
        return self._power_action('shutdown', 'shutting down', '/s', '-h', force, timeout)

    def restart_system(self, force=False, timeout=0):
        """Restart the machine.

        Args:
            force: Skip the confirmation prompt when True.
            timeout: Seconds to wait before issuing the command.

        Returns:
            True if the restart command succeeded, False if the user
            cancelled or the command failed.
        """
        return self._power_action('restart', 'restarting', '/r', '-r', force, timeout)
|
||||||
48
comprehensive-tools/test.py
Normal file
48
comprehensive-tools/test.py
Normal file
|
|
@ -0,0 +1,48 @@
|
||||||
|
from spider import WebSpider
|
||||||
|
from system_tools import SystemTools
|
||||||
|
|
||||||
|
# Exercise the web spider
def test_spider():
    """Smoke-test WebSpider: plain crawl, rule-based crawl and JSON/CSV export.

    Needs network access to https://example.com. Results are printed rather
    than asserted because the page content is outside this project's control.
    """
    print('测试网络爬虫...')
    url = 'https://example.com'

    # Plain crawl without extraction rules
    spider = WebSpider()
    spider.set_max_depth(1)
    data = spider.crawl(url)
    print(f'爬取到 {len(data)} 条数据')

    # Rule-based crawl. A fresh spider is used because a spider remembers the
    # URLs it has visited; reusing the first one would skip `url` and return
    # an empty list, so the save step below would never run.
    rules = [{
        'selector': 'a',
        'extract': {
            'text': 'text',
            'href': 'attr:href'
        }
    }]
    rule_spider = WebSpider()
    rule_spider.set_max_depth(1)
    data_with_rules = rule_spider.crawl(url, rules)
    print(f'带规则爬取到 {len(data_with_rules)} 条数据')

    # Export the extracted items in both supported formats
    if data_with_rules:
        rule_spider.save_to_json(data_with_rules, 'test_spider.json')
        rule_spider.save_to_csv(data_with_rules, 'test_spider.csv')
        print('数据已保存到 test_spider.json 和 test_spider.csv')
|
||||||
|
|
||||||
|
# Exercise the system management tools
def test_system_tools():
    """Smoke-test SystemTools.delete_files on a throwaway file.

    Creates ``test_file.txt`` in the current directory, deletes it with
    ``force=True`` (no confirmation prompt) and checks the reported result.
    """
    print('\n测试系统管理工具...')
    system_tools = SystemTools()

    # Create the throwaway file. The encoding is explicit because the content
    # is non-ASCII and the platform default encoding may not be able to
    # represent it.
    with open('test_file.txt', 'w', encoding='utf-8') as f:
        f.write('测试文件')

    # Delete it and verify the outcome
    result = system_tools.delete_files(['test_file.txt'], force=True)
    print(f'删除文件结果: {result}')
    assert result['deleted'] == ['test_file.txt'], result
    assert result['failed'] == [], result
|
||||||
|
|
||||||
|
# Run both smoke tests in order when this file is executed as a script,
# then print a completion message.
if __name__ == '__main__':
    test_spider()
    test_system_tools()
    print('\n测试完成!')
|
||||||
Loading…
Reference in a new issue