mirror of
https://github.com/codecrafters-io/build-your-own-x
synced 2026-07-02 16:59:25 +00:00
Merge acb43dadc5 into 294aef8fde
This commit is contained in:
commit
36813c1608
8 changed files with 446 additions and 0 deletions
68
comprehensive-tools/README.md
Normal file
68
comprehensive-tools/README.md
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
# 综合工具集合
|
||||
|
||||
本项目提供了一个综合性的工具集合,包含两个主要功能模块:网络爬虫工具和系统管理工具。
|
||||
|
||||
## 功能模块
|
||||
|
||||
### 1. 网络爬虫工具
|
||||
- 支持自定义URL输入和爬取规则配置
|
||||
- 包含数据提取、存储和导出功能
|
||||
- 实现基本的反爬机制和错误处理
|
||||
- 支持JSON和CSV格式数据导出
|
||||
|
||||
### 2. 系统管理工具
|
||||
- 批量删除指定目录(含子目录)下的文本文件(`.txt`),可选择跳过逐个确认直接强制删除
|
||||
- 文件强制删除的安全确认机制
|
||||
- 系统强制关机功能及定时关机选项
|
||||
- 系统重启功能及定时重启选项
|
||||
|
||||
## 安装依赖
|
||||
|
||||
```bash
|
||||
pip install requests beautifulsoup4
|
||||
```
|
||||
|
||||
## 使用方法
|
||||
|
||||
1. 运行主程序
|
||||
|
||||
```bash
|
||||
python main.py
|
||||
```
|
||||
|
||||
2. 选择功能模块
|
||||
|
||||
### 网络爬虫工具使用示例
|
||||
|
||||
1. 选择"1. 网络爬虫工具"
|
||||
2. 选择"1. 开始爬取"
|
||||
3. 输入起始URL,例如:`https://example.com`
|
||||
4. 根据提示设置爬取规则(可选)
|
||||
5. 爬取完成后选择是否保存数据及保存格式
|
||||
|
||||
### 系统管理工具使用示例
|
||||
|
||||
1. 选择"2. 系统管理工具"
|
||||
2. 选择相应的功能:
|
||||
- "1. 批量删除文本文件":删除指定目录下的所有文本文件
|
||||
- "2. 删除指定文件":删除用户指定的文件
|
||||
- "3. 关闭系统":关闭计算机
|
||||
- "4. 重启系统":重启计算机
|
||||
|
||||
## 注意事项
|
||||
|
||||
- 网络爬虫工具不会自动检查robots.txt,使用前请自行确认目标站点的爬取规则,请勿用于非法爬取
|
||||
- 系统管理工具的关机和重启功能需要管理员权限
|
||||
- 批量删除文件时请谨慎操作,建议先备份重要数据
|
||||
|
||||
## 代码结构
|
||||
|
||||
- `main.py`:主程序,提供用户界面
|
||||
- `spider.py`:网络爬虫模块
|
||||
- `system_tools.py`:系统管理工具模块
|
||||
- `test.py`:测试文件
|
||||
- `README.md`:使用说明
|
||||
|
||||
## 贡献
|
||||
|
||||
欢迎提交问题和改进建议。
|
||||
BIN
comprehensive-tools/__pycache__/main.cpython-314.pyc
Normal file
BIN
comprehensive-tools/__pycache__/main.cpython-314.pyc
Normal file
Binary file not shown.
BIN
comprehensive-tools/__pycache__/spider.cpython-314.pyc
Normal file
BIN
comprehensive-tools/__pycache__/spider.cpython-314.pyc
Normal file
Binary file not shown.
BIN
comprehensive-tools/__pycache__/system_tools.cpython-314.pyc
Normal file
BIN
comprehensive-tools/__pycache__/system_tools.cpython-314.pyc
Normal file
Binary file not shown.
152
comprehensive-tools/main.py
Normal file
152
comprehensive-tools/main.py
Normal file
|
|
@ -0,0 +1,152 @@
|
|||
from spider import WebSpider
|
||||
from system_tools import SystemTools
|
||||
|
||||
def print_menu():
    """Print the top-level menu: a title framed by rules, then the options."""
    divider = '=' * 60
    menu_lines = (
        divider,
        '综合工具集合',
        divider,
        '1. 网络爬虫工具',
        '2. 系统管理工具',
        '0. 退出',
        divider,
    )
    for line in menu_lines:
        print(line)
|
||||
|
||||
def _ask_yes_no(prompt):
    """Return True only when the user answers the prompt with 'y' or 'Y'."""
    return input(prompt).lower() == 'y'


def _prompt_crawl_rules():
    """Interactively collect crawl rules and return them as a list.

    Each rule is a dict with a 'selector' entry (the CSS selector text the
    user typed) and an 'extract' entry mapping field name -> extractor
    string ('text' or 'attr:<name>', as advertised by the prompt).
    """
    rules = []
    while True:
        selector = input('请输入CSS选择器: ')
        extract = {}
        if _ask_yes_no('是否设置提取规则? (y/n): '):
            while True:
                key = input('请输入字段名: ')
                extract[key] = input('请输入提取方式 (text 或 attr:属性名): ')
                if not _ask_yes_no('是否添加更多提取规则? (y/n): '):
                    break
        rules.append({'selector': selector, 'extract': extract})
        if not _ask_yes_no('是否添加更多爬取规则? (y/n): '):
            return rules


def _save_crawl_results(spider, data):
    """Ask for a format and file name, then write data through the spider."""
    # Named output_format (not format) so the builtin is not shadowed.
    output_format = input('保存格式 (json/csv): ').lower()
    filename = input('请输入文件名: ')

    if output_format == 'json':
        target, writer = filename + '.json', spider.save_to_json
    elif output_format == 'csv':
        target, writer = filename + '.csv', spider.save_to_csv
    else:
        # Previously an unknown format was dropped without any feedback.
        print('不支持的保存格式,数据未保存')
        return

    try:
        writer(data, target)
    except (OSError, ValueError) as e:
        # A bad file name or unwritable rows must not kill the whole menu.
        print(f'保存失败: {e}')
        return
    print(f'数据已保存到 {target}')


def spider_menu():
    """Run the interactive crawler sub-menu until the user picks '0'.

    One WebSpider instance lives for the whole sub-menu session, so proxy
    and depth settings chosen here apply to every later crawl.
    """
    spider = WebSpider()

    while True:
        print('\n网络爬虫工具')
        print('1. 开始爬取')
        print('2. 设置代理')
        print('3. 设置爬取深度')
        print('0. 返回主菜单')

        choice = input('请选择: ')

        if choice == '1':
            url = input('请输入起始URL: ')
            rules = None
            if _ask_yes_no('是否设置爬取规则? (y/n): '):
                rules = _prompt_crawl_rules()

            data = spider.crawl(url, rules)

            if data and _ask_yes_no('是否保存数据? (y/n): '):
                _save_crawl_results(spider, data)

        elif choice == '2':
            proxy = input('请输入代理地址 (格式: http://ip:port): ')
            spider.set_proxies({'http': proxy, 'https': proxy})
            print('代理设置成功')

        elif choice == '3':
            # int() raises ValueError on non-numeric input; report it and
            # return to the menu instead of crashing the program.
            try:
                depth = int(input('请输入爬取深度: '))
            except ValueError:
                print('无效输入,请输入整数')
                continue
            if depth < 0:
                # A negative depth would make every crawl return nothing.
                print('无效输入,爬取深度不能为负数')
                continue
            spider.set_max_depth(depth)
            print('爬取深度设置成功')

        elif choice == '0':
            break

        else:
            print('无效选择,请重新输入')
|
||||
|
||||
def _read_delay_seconds():
    """Prompt for a delay in seconds.

    Returns the delay as a non-negative int, or None (after printing a
    message) when the input is not a valid non-negative integer.
    """
    try:
        delay = int(input('请输入延迟时间 (秒,0表示立即): '))
    except ValueError:
        print('无效输入,请输入整数')
        return None
    if delay < 0:
        print('无效输入,延迟时间不能为负数')
        return None
    return delay


def _report_deletion(result):
    """Print the counts, and any failures, from a deletion result dict.

    result is expected to have 'deleted' (a list of paths) and 'failed'
    (a list of (path, reason) pairs), which is how it is indexed below.
    """
    print(f'成功删除: {len(result["deleted"])} 个文件')
    print(f'失败: {len(result["failed"])} 个文件')
    if result["failed"]:
        print('失败列表:')
        for file, error in result["failed"]:
            print(f'  - {file}: {error}')


def system_tools_menu():
    """Run the interactive system-tools sub-menu until the user picks '0'."""
    system_tools = SystemTools()

    while True:
        print('\n系统管理工具')
        print('1. 批量删除文本文件')
        print('2. 删除指定文件')
        print('3. 关闭系统')
        print('4. 重启系统')
        print('0. 返回主菜单')

        choice = input('请选择: ')

        if choice == '1':
            directory = input('请输入目录路径: ')
            force = input('是否强制删除 (y/n): ').lower() == 'y'
            _report_deletion(system_tools.batch_delete_text_files(directory, force))

        elif choice == '2':
            raw_paths = input('请输入文件路径,多个文件用逗号分隔: ').split(',')
            # Drop blank entries (e.g. from a trailing comma) so they are
            # not passed on as empty file paths.
            files = [path.strip() for path in raw_paths if path.strip()]
            force = input('是否强制删除 (y/n): ').lower() == 'y'
            _report_deletion(system_tools.delete_files(files, force))

        elif choice == '3':
            timeout = _read_delay_seconds()
            if timeout is None:
                continue
            force = input('是否强制关闭 (y/n): ').lower() == 'y'
            system_tools.shutdown_system(force, timeout)

        elif choice == '4':
            timeout = _read_delay_seconds()
            if timeout is None:
                continue
            force = input('是否强制重启 (y/n): ').lower() == 'y'
            system_tools.restart_system(force, timeout)

        elif choice == '0':
            break

        else:
            print('无效选择,请重新输入')
|
||||
|
||||
def main():
    """Show the top-level menu repeatedly and dispatch to the chosen sub-menu.

    Returns once the user selects '0'.
    """
    submenus = {'1': spider_menu, '2': system_tools_menu}

    while True:
        print_menu()
        choice = input('请选择: ')

        if choice == '0':
            print('感谢使用,再见!')
            return

        handler = submenus.get(choice)
        if handler is None:
            print('无效选择,请重新输入')
            continue
        handler()


if __name__ == '__main__':
    main()
|
||||
81
comprehensive-tools/spider.py
Normal file
81
comprehensive-tools/spider.py
Normal file
|
|
@ -0,0 +1,81 @@
|
|||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import csv
|
||||
import time
|
||||
import random
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
class WebSpider:
    """Small recursive crawler with CSS-selector extraction and JSON/CSV export.

    Configuration is held in plain attributes:
        headers      -- HTTP headers sent with every request.
        proxies      -- proxies mapping handed to requests.get.
        max_depth    -- deepest link level followed (the start URL is depth 0).
        verify_ssl   -- whether TLS certificates are verified. Defaults to
                        True; set it to False only for hosts you trust.
        visited_urls -- URLs already fetched during the current crawl.
    """

    # Upper bound on how many links found on one page are followed.
    MAX_LINKS_PER_PAGE = 10

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.proxies = {}
        self.visited_urls = set()
        self.max_depth = 2
        self.verify_ssl = True

    def set_proxies(self, proxies):
        """Replace the proxies mapping used for subsequent requests."""
        self.proxies = proxies

    def set_max_depth(self, depth):
        """Set the deepest link level that crawl() will follow."""
        self.max_depth = depth

    def crawl(self, start_url, rules=None, depth=0):
        """Fetch start_url, extract items, and recurse into the links it contains.

        Args:
            start_url: URL to fetch.
            rules: optional list of dicts. Each needs a 'selector' (CSS
                selector) and may carry an 'extract' mapping of field name ->
                'text' or 'attr:<attribute>'. Extractors in any other form
                are ignored. Without rules no items are produced.
            depth: current recursion level; external callers leave it at 0.

        Returns:
            A list of item dicts gathered from this page and the pages
            reached from it. A page that fails to download contributes
            nothing; the error is printed rather than raised.
        """
        if depth == 0:
            # Each top-level crawl starts from a clean slate. Otherwise a
            # second crawl of the same URL on this instance returns [].
            self.visited_urls.clear()

        if depth > self.max_depth or start_url in self.visited_urls:
            return []

        self.visited_urls.add(start_url)
        print(f'Crawling: {start_url}')

        try:
            # Random pause between requests to avoid hammering the server.
            time.sleep(random.uniform(1, 3))
            response = requests.get(
                start_url,
                headers=self.headers,
                proxies=self.proxies,
                timeout=10,
                verify=self.verify_ssl,
            )
            response.raise_for_status()
        except Exception as e:
            # Deliberately broad: one bad page must not abort the whole crawl.
            print(f'Error crawling {start_url}: {e}')
            return []

        soup = BeautifulSoup(response.text, 'html.parser')
        data = self._extract_items(soup, rules)

        for link in self._collect_links(soup, start_url)[:self.MAX_LINKS_PER_PAGE]:
            data.extend(self.crawl(link, rules, depth + 1))

        return data

    def _extract_items(self, soup, rules):
        """Apply every rule to the parsed page and return the extracted items."""
        items = []
        for rule in rules or []:
            for element in soup.select(rule['selector']):
                item = {}
                for key, extractor in rule.get('extract', {}).items():
                    if extractor == 'text':
                        item[key] = element.get_text(strip=True)
                    elif extractor.startswith('attr:'):
                        item[key] = element.get(extractor.split(':', 1)[1], '')
                items.append(item)
        return items

    def _collect_links(self, soup, base_url):
        """Return absolute http(s) links found on the page, in document order."""
        links = []
        for anchor in soup.find_all('a', href=True):
            try:
                parsed = urlparse(urljoin(base_url, anchor['href']))
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal); skip it.
                continue
            if parsed.scheme in ('http', 'https'):
                # Drop the fragment: '#section' variants address the same
                # document and would otherwise be fetched again.
                links.append(parsed._replace(fragment='').geturl())
        return links

    def save_to_json(self, data, filename):
        """Write data to filename as indented UTF-8 JSON."""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

    def save_to_csv(self, data, filename):
        """Write a list of dicts to filename as UTF-8 CSV.

        Nothing is written when data is empty. The header is the union of
        all keys in first-seen order, so rows with differing keys are
        written with blanks instead of raising ValueError.
        """
        if not data:
            return

        fieldnames = list(dict.fromkeys(key for row in data for key in row))
        with open(filename, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(data)
|
||||
97
comprehensive-tools/system_tools.py
Normal file
97
comprehensive-tools/system_tools.py
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
import ctypes
|
||||
|
||||
class SystemTools:
    """File-deletion and shutdown/restart helpers driven from the console.

    In every method, force=True skips the interactive y/n confirmation.
    """

    def __init__(self):
        pass

    def delete_files(self, file_paths, force=False):
        """Delete each path in file_paths and report the outcome.

        Directories are removed recursively. Symbolic links are unlinked
        without touching their target, including links that point at a
        directory and links whose target is missing.

        Args:
            file_paths: iterable of paths to delete.
            force: when False, ask for confirmation before each deletion.

        Returns:
            {'deleted': [path, ...], 'failed': [(path, reason), ...]}
        """
        deleted = []
        failed = []

        for file_path in file_paths:
            # lexists (not exists) so dangling symlinks are still found.
            if not os.path.lexists(file_path):
                failed.append((file_path, 'File not found'))
                continue

            if not force:
                confirm = input(f'Are you sure you want to delete {file_path}? (y/n): ')
                if confirm.lower() != 'y':
                    failed.append((file_path, 'User cancelled'))
                    continue

            try:
                # shutil.rmtree refuses to operate on a symlink, so only
                # real directories go through it; links are just unlinked.
                if os.path.isdir(file_path) and not os.path.islink(file_path):
                    shutil.rmtree(file_path)
                else:
                    os.remove(file_path)
            except OSError as e:
                failed.append((file_path, str(e)))
                print(f'Failed to delete {file_path}: {e}')
            else:
                deleted.append(file_path)
                print(f'Deleted: {file_path}')

        return {'deleted': deleted, 'failed': failed}

    def batch_delete_text_files(self, directory, force=False):
        """Delete every file ending in '.txt' under directory, recursively.

        The suffix match is case-sensitive. When directory is not an
        existing directory, a message is printed and an empty result is
        returned. The return value has the same shape as delete_files().
        """
        if not os.path.isdir(directory):
            print(f'Directory not found: {directory}')
            return {'deleted': [], 'failed': []}

        text_files = [
            os.path.join(root, name)
            for root, _dirs, names in os.walk(directory)
            for name in names
            if name.endswith('.txt')
        ]

        print(f'Found {len(text_files)} text files to delete')
        return self.delete_files(text_files, force)

    def _power_action(self, verb, gerund, windows_flag, unix_flag, force, timeout):
        """Confirm, optionally wait, then run the platform shutdown command.

        Returns True when the command ran successfully, False when the user
        declined or the command failed.
        """
        if not force:
            confirm = input(f'Are you sure you want to {verb} the system? (y/n): ')
            if confirm.lower() != 'y':
                print(f'{verb.capitalize()} cancelled')
                return False

        if timeout > 0:
            print(f'System will {verb} in {timeout} seconds...')
            # The delay is served here; the OS command below runs at once.
            time.sleep(timeout)

        if os.name == 'nt':  # Windows
            command = ['shutdown', windows_flag, '/t', '0']
        else:  # Unix-like
            command = ['shutdown', unix_flag, 'now']

        try:
            subprocess.run(command, check=True)
        except (OSError, subprocess.CalledProcessError) as e:
            print(f'Error {gerund} system: {e}')
            return False
        return True

    def shutdown_system(self, force=False, timeout=0):
        """Power the machine off, after timeout seconds when timeout > 0.

        Returns True on success, False when cancelled or on failure.
        """
        return self._power_action('shutdown', 'shutting down', '/s', '-h', force, timeout)

    def restart_system(self, force=False, timeout=0):
        """Reboot the machine, after timeout seconds when timeout > 0.

        Returns True on success, False when cancelled or on failure.
        """
        return self._power_action('restart', 'restarting', '/r', '-r', force, timeout)
|
||||
48
comprehensive-tools/test.py
Normal file
48
comprehensive-tools/test.py
Normal file
|
|
@ -0,0 +1,48 @@
|
|||
from spider import WebSpider
|
||||
from system_tools import SystemTools
|
||||
|
||||
# 测试网络爬虫
|
||||
def test_spider():
    """Smoke-test WebSpider against https://example.com.

    This needs network access. It prints progress and performs no assertions.
    """
    print('测试网络爬虫...')
    url = 'https://example.com'

    # Plain crawl without extraction rules.
    spider = WebSpider()
    spider.set_max_depth(1)
    data = spider.crawl(url)
    print(f'爬取到 {len(data)} 条数据')

    # Rule-based crawl. A fresh spider is used so the result does not depend
    # on visited-URL state left behind by the first crawl.
    rules = [{
        'selector': 'a',
        'extract': {
            'text': 'text',
            'href': 'attr:href'
        }
    }]
    rule_spider = WebSpider()
    rule_spider.set_max_depth(1)
    data_with_rules = rule_spider.crawl(url, rules)
    print(f'带规则爬取到 {len(data_with_rules)} 条数据')

    # Exercise both export paths when something was extracted.
    if data_with_rules:
        rule_spider.save_to_json(data_with_rules, 'test_spider.json')
        rule_spider.save_to_csv(data_with_rules, 'test_spider.csv')
        print('数据已保存到 test_spider.json 和 test_spider.csv')
|
||||
|
||||
# 测试系统管理工具
|
||||
def test_system_tools():
    """Smoke-test SystemTools.delete_files on a scratch file in the working directory."""
    print('\n测试系统管理工具...')
    system_tools = SystemTools()

    # Create the scratch file. The explicit encoding matters because the
    # content is non-ASCII and the platform default may not be able to
    # encode it.
    with open('test_file.txt', 'w', encoding='utf-8') as f:
        f.write('测试文件')

    # force=True skips the interactive confirmation prompt.
    result = system_tools.delete_files(['test_file.txt'], force=True)
    print(f'删除文件结果: {result}')


if __name__ == '__main__':
    test_spider()
    test_system_tools()
    print('\n测试完成!')
|
||||
Loading…
Reference in a new issue