diff --git a/README.md b/README.md index 353f7fa..dbf2d5f 100644 --- a/README.md +++ b/README.md @@ -11,41 +11,39 @@ ___________________ _________ \/ \/ ``` -## What's New in v3.0 šŸŽ‰ +## What's New in v3.1 šŸŽ‰ -**QR Code Authentication:** -- **No phone number required** - Login with QR code scanning (still need API credentials) -- **Faster authentication** - Just scan with your phone after API setup -- **Secure login** - Recommended authentication method -- **2FA support** for both QR and phone methods +**Enhanced Message Data:** +- **Message statistics** - Captures views, forwards, and post_author for each message +- **Reactions support** - Records all emoji reactions with counts (e.g., "šŸ˜€ 12 šŸ‘ 3") +- **Automatic database migration** - Seamlessly adds new columns to existing databases +- **Richer exports** - All new data included in CSV/JSON exports -**Enhanced User Experience:** -- **Numbered channel selection** - Use 1,2,3 instead of full channel IDs -- **Multi-channel operations** - Add, remove, and scrape multiple channels at once -- **Streamlined menu** - Cleaner interface with fewer redundant options -- **Progress bars** for media downloads with visual feedback +**Improved Channel Management:** +- **Channel names displayed** - Shows channel names alongside IDs everywhere +- **Smart filtering** - List option now only shows Channels and Groups (no private chats) +- **channels_list.csv export** - Automatically saves channel list with names, IDs, usernames, and types +- **"all" selection** - Quickly add all listed channels at once +- **Better export naming** - Files now named as `ID_username.csv` and `ID_username.json` -**Media Download Improvements:** -- **Fixed file overwriting** - Unique naming prevents media files from being overwritten -- **5x concurrent downloads** - Increased from 3 to 5 for faster media processing -- **Better error handling** - Improved retry logic and recovery - -**Performance & Stability:** -- **Database optimizations** - WAL mode and faster operations -- **Hidden warnings** - Cleaner output without technical messages -- **Better error recovery** - More robust handling of network issues +**Bug Fixes:** +- **Fixed channel ID parsing** - Resolved "invalid literal for int()" error in fix missing media +- **Better entity resolution** - Handles both numeric IDs and channel usernames +- **Improved error messages** - Shows channel names with IDs for clearer debugging ## Features šŸš€ - **QR Code & Phone Authentication** - Choose your preferred login method -- Scrape messages from multiple Telegram channels +- Scrape messages with full metadata (views, forwards, reactions, post author) - Download media files with parallel processing and unique naming - Real-time continuous scraping -- Export data to JSON and CSV formats -- SQLite database storage with optimized performance +- Export data to JSON and CSV formats with enhanced metadata +- SQLite database storage with automatic schema migration - Resume capability (saves progress) -- Interactive menu with numbered channel selection +- Interactive menu with channel names and numbered selection +- Smart channel filtering (only shows channels/groups) - Progress tracking with visual progress bars +- Automatic channels list export to CSV ## Prerequisites šŸ“‹ @@ -128,16 +126,23 @@ Instead of typing long channel IDs, use numbers: **Adding Channels:** ``` -[1] The News (Chat) (id: -1002116176890) -[2] Python Channel (id: -1001597139842) -[3] The Corner (id: -1002274713954) +[1] Tech News (ID: -1002116176890, Type: Channel, Username: @technews) +[2] Python Dev (ID: -1001597139842, Type: Group, Username: @pythondev) +[3] Daily Updates (ID: -1002274713954, Type: Channel, Username: @dailyupdates) Enter: 1,3 (adds channels 1 and 3) +Or: all (adds all listed channels) +``` + +**Viewing Your Channels:** +``` +[1] Tech News (ID: -1002116176890), Last Message ID: 5234, Messages: 12450 +[2] Python Dev (ID: -1001597139842), Last Message ID: 8192, Messages: 45782 ``` **Scraping Channels:** - Single: `1` -- Multiple: `1,3,5` +- Multiple: `1,3,5` - All: `all` - Mix formats: `1,-1001597139842,3` @@ -149,6 +154,8 @@ Data is stored in SQLite databases, one per channel: - Location: `./channelname/channelname.db` - Optimized with indexes for fast queries - WAL mode for better performance +- Schema includes: message_id, date, sender info, message text, media info, reply_to, post_author, views, forwards, reactions +- Automatic migration adds new columns to existing databases ### Media Storage šŸ“ @@ -160,8 +167,11 @@ Media files are stored with unique naming: ### Exported Data šŸ“Š Export formats: -1. **CSV**: `./channelname/channelname.csv` -2. **JSON**: `./channelname/channelname.json` +1. **CSV**: `./channelname/channelid_username.csv` +2. **JSON**: `./channelname/channelid_username.json` +3. **Channel List**: `./channels_list.csv` (automatically created when using [L] option) + +All exports include complete message metadata: views, forwards, reactions, and post author information. ## Performance Features āš™ļø diff --git a/telegram-scraper.py b/telegram-scraper.py index 900910a..5b5863d 100644 --- a/telegram-scraper.py +++ b/telegram-scraper.py @@ -12,7 +12,7 @@ from typing import Dict, List, Optional, Any from pathlib import Path from io import StringIO from telethon import TelegramClient -from telethon.tl.types import MessageMediaPhoto, MessageMediaDocument, MessageMediaWebPage, User, PeerChannel +from telethon.tl.types import MessageMediaPhoto, MessageMediaDocument, MessageMediaWebPage, User, PeerChannel, Channel, Chat from telethon.errors import FloodWaitError, SessionPasswordNeededError import qrcode @@ -43,6 +43,10 @@ class MessageData: media_type: Optional[str] media_path: Optional[str] reply_to: Optional[int] + post_author: Optional[str] + views: Optional[int] + forwards: Optional[int] + reactions: Optional[str] class OptimizedTelegramScraper: def __init__(self): @@ -66,6 +70,7 @@ class OptimizedTelegramScraper: 'api_id': None, 'api_hash': None, 'channels': {}, + 'channel_names': {}, 'scrape_media': True, } @@ -80,22 +85,50 @@ class OptimizedTelegramScraper: if channel not in self.db_connections: channel_dir = Path(channel) channel_dir.mkdir(exist_ok=True) - + db_file = channel_dir / f'{channel}.db' conn = sqlite3.connect(str(db_file), check_same_thread=False) conn.execute('''CREATE TABLE IF NOT EXISTS messages - (id INTEGER PRIMARY KEY, message_id INTEGER UNIQUE, date TEXT, - sender_id INTEGER, first_name TEXT, last_name TEXT, username TEXT, - message TEXT, media_type TEXT, media_path TEXT, reply_to INTEGER)''') + (id INTEGER PRIMARY KEY, message_id INTEGER UNIQUE, date TEXT, + sender_id INTEGER, first_name TEXT, last_name TEXT, username TEXT, + message TEXT, media_type TEXT, media_path TEXT, reply_to INTEGER, + post_author TEXT, views INTEGER, forwards INTEGER, reactions TEXT)''') conn.execute('CREATE INDEX IF NOT EXISTS idx_message_id ON messages(message_id)') conn.execute('CREATE INDEX IF NOT EXISTS idx_date ON messages(date)') conn.execute('PRAGMA journal_mode=WAL') conn.execute('PRAGMA synchronous=NORMAL') conn.commit() + + self.migrate_database(conn) + self.db_connections[channel] = conn - + return self.db_connections[channel] + def migrate_database(self, conn: sqlite3.Connection): + cursor = conn.cursor() + cursor.execute("PRAGMA table_info(messages)") + columns = {row[1] for row in cursor.fetchall()} + + migrations = [] + if 'post_author' not in columns: + migrations.append('ALTER TABLE messages ADD COLUMN post_author TEXT') + if 'views' not in columns: + migrations.append('ALTER TABLE messages ADD COLUMN views INTEGER') + if 'forwards' not in columns: + migrations.append('ALTER TABLE messages ADD COLUMN forwards INTEGER') + if 'reactions' not in columns: + migrations.append('ALTER TABLE messages ADD COLUMN reactions TEXT') + + for migration in migrations: + try: + conn.execute(migration) + except: + pass + + if migrations: + conn.commit() + def close_db_connections(self): for conn in self.db_connections.values(): conn.close() @@ -104,16 +137,18 @@ class OptimizedTelegramScraper: def batch_insert_messages(self, channel: str, messages: List[MessageData]): if not messages: return - + conn = self.get_db_connection(channel) - data = [(msg.message_id, msg.date, msg.sender_id, msg.first_name, - msg.last_name, msg.username, msg.message, msg.media_type, - msg.media_path, msg.reply_to) for msg in messages] - - conn.executemany('''INSERT OR IGNORE INTO messages - (message_id, date, sender_id, first_name, last_name, username, - message, media_type, media_path, reply_to) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', data) + data = [(msg.message_id, msg.date, msg.sender_id, msg.first_name, + msg.last_name, msg.username, msg.message, msg.media_type, + msg.media_path, msg.reply_to, msg.post_author, msg.views, + msg.forwards, msg.reactions) for msg in messages] + + conn.executemany('''INSERT OR IGNORE INTO messages + (message_id, date, sender_id, first_name, last_name, username, + message, media_type, media_path, reply_to, post_author, views, + forwards, reactions) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', data) conn.commit() async def download_media(self, channel: str, message) -> Optional[str]: @@ -195,7 +230,18 @@ class OptimizedTelegramScraper: async for message in self.client.iter_messages(entity, offset_id=offset_id, reverse=True): try: sender = await message.get_sender() - + + reactions_str = None + if message.reactions and message.reactions.results: + reactions_parts = [] + for reaction in message.reactions.results: + emoji = getattr(reaction.reaction, 'emoticon', '') + count = reaction.count + if emoji: + reactions_parts.append(f"{emoji} {count}") + if reactions_parts: + reactions_str = ' '.join(reactions_parts) + msg_data = MessageData( message_id=message.id, date=message.date.strftime('%Y-%m-%d %H:%M:%S'), @@ -206,9 +252,13 @@ class OptimizedTelegramScraper: message=message.message or '', media_type=message.media.__class__.__name__ if message.media else None, media_path=None, - reply_to=message.reply_to_msg_id if message.reply_to else None + reply_to=message.reply_to_msg_id if message.reply_to else None, + post_author=message.post_author, + views=message.views, + forwards=message.forwards, + reactions=reactions_str ) - + message_batch.append(msg_data) if self.state['scrape_media'] and message.media and not isinstance(message.media, MessageMediaWebPage): @@ -289,14 +339,19 @@ class OptimizedTelegramScraper: cursor.execute('SELECT message_id FROM messages WHERE media_type IS NOT NULL AND media_type != "MessageMediaWebPage" AND media_path IS NULL') message_ids = [row[0] for row in cursor.fetchall()] + channel_name = self.state.get('channel_names', {}).get(channel, 'Unknown') + if not message_ids: - print(f"No media files to reprocess for channel {channel}") + print(f"No media files to reprocess for {channel_name} (ID: {channel})") return - print(f"šŸ“„ Reprocessing {len(message_ids)} media files for channel {channel}") + print(f"šŸ“„ Reprocessing {len(message_ids)} media files for {channel_name} (ID: {channel})") try: - entity = await self.client.get_entity(PeerChannel(int(channel))) + if channel.lstrip('-').isdigit(): + entity = await self.client.get_entity(PeerChannel(int(channel))) + else: + entity = await self.client.get_entity(channel) semaphore = asyncio.Semaphore(self.max_concurrent_downloads) completed_media = 0 successful_downloads = 0 @@ -339,35 +394,39 @@ class OptimizedTelegramScraper: async def fix_missing_media(self, channel: str): conn = self.get_db_connection(channel) cursor = conn.cursor() - + cursor.execute('SELECT COUNT(*) FROM messages WHERE media_type IS NOT NULL AND media_type != "MessageMediaWebPage"') total_with_media = cursor.fetchone()[0] - + cursor.execute('SELECT COUNT(*) FROM messages WHERE media_type IS NOT NULL AND media_type != "MessageMediaWebPage" AND media_path IS NOT NULL') total_with_files = cursor.fetchone()[0] - + missing_count = total_with_media - total_with_files - - print(f"\nšŸ“Š Media Analysis for {channel}:") + + channel_name = self.state.get('channel_names', {}).get(channel, 'Unknown') + print(f"\nšŸ“Š Media Analysis for {channel_name} (ID: {channel}):") print(f"Messages with media: {total_with_media}") print(f"Media files downloaded: {total_with_files}") print(f"Missing media files: {missing_count}") - + if missing_count == 0: print("āœ… All media files are already downloaded!") return - + cursor.execute('SELECT message_id, media_type FROM messages WHERE media_type IS NOT NULL AND media_type != "MessageMediaWebPage" AND (media_path IS NULL OR media_path = "")') missing_media = cursor.fetchall() - + if not missing_media: print("āœ… No missing media found!") return print(f"\nšŸ”§ Attempting to download {len(missing_media)} missing media files...") - + try: - entity = await self.client.get_entity(PeerChannel(int(channel))) + if channel.lstrip('-').isdigit(): + entity = await self.client.get_entity(PeerChannel(int(channel))) + else: + entity = await self.client.get_entity(channel) semaphore = asyncio.Semaphore(self.max_concurrent_downloads) completed_media = 0 successful_downloads = 0 @@ -432,18 +491,23 @@ class OptimizedTelegramScraper: finally: self.continuous_scraping_active = False + def get_export_filename(self, channel: str): + username = self.state.get('channel_names', {}).get(channel, 'no_username') + return f"{channel}_{username}" + def export_to_csv(self, channel: str): conn = self.get_db_connection(channel) - csv_file = Path(channel) / f'{channel}.csv' - + filename = self.get_export_filename(channel) + csv_file = Path(channel) / f'{filename}.csv' + cursor = conn.cursor() cursor.execute('SELECT * FROM messages ORDER BY date') columns = [description[0] for description in cursor.description] - + with open(csv_file, 'w', newline='', encoding='utf-8') as f: writer = csv.writer(f) writer.writerow(columns) - + while True: rows = cursor.fetchmany(1000) if not rows: @@ -452,30 +516,31 @@ class OptimizedTelegramScraper: def export_to_json(self, channel: str): conn = self.get_db_connection(channel) - json_file = Path(channel) / f'{channel}.json' - + filename = self.get_export_filename(channel) + json_file = Path(channel) / f'{filename}.json' + cursor = conn.cursor() cursor.execute('SELECT * FROM messages ORDER BY date') columns = [description[0] for description in cursor.description] - + with open(json_file, 'w', encoding='utf-8') as f: f.write('[\n') first_row = True - + while True: rows = cursor.fetchmany(1000) if not rows: break - + for row in rows: if not first_row: f.write(',\n') else: first_row = False - + data = dict(zip(columns, row)) json.dump(data, f, ensure_ascii=False, indent=2) - + f.write('\n]') async def export_data(self): @@ -496,7 +561,7 @@ class OptimizedTelegramScraper: if not self.state['channels']: print("No channels saved") return - + print("\nCurrent channels:") for i, (channel, last_id) in enumerate(self.state['channels'].items(), 1): try: @@ -504,20 +569,45 @@ class OptimizedTelegramScraper: cursor = conn.cursor() cursor.execute('SELECT COUNT(*) FROM messages') count = cursor.fetchone()[0] - print(f"[{i}] Channel ID: {channel}, Last Message ID: {last_id}, Messages: {count}") + channel_name = self.state.get('channel_names', {}).get(channel, 'Unknown') + print(f"[{i}] {channel_name} (ID: {channel}), Last Message ID: {last_id}, Messages: {count}") except: - print(f"[{i}] Channel ID: {channel}, Last Message ID: {last_id}") + channel_name = self.state.get('channel_names', {}).get(channel, 'Unknown') + print(f"[{i}] {channel_name} (ID: {channel}), Last Message ID: {last_id}") async def list_channels(self): try: - print("\nList of channels joined by account:") + print("\nList of channels and groups joined by account:") count = 1 + channels_data = [] async for dialog in self.client.iter_dialogs(): - if dialog.id != 777000: - print(f"[{count}] {dialog.title} (id: {dialog.id})") + entity = dialog.entity + if dialog.id != 777000 and (isinstance(entity, Channel) or isinstance(entity, Chat)): + channel_type = "Channel" if isinstance(entity, Channel) and entity.broadcast else "Group" + username = getattr(entity, 'username', None) or 'no_username' + print(f"[{count}] {dialog.title} (ID: {dialog.id}, Type: {channel_type}, Username: @{username})") + channels_data.append({ + 'number': count, + 'channel_name': dialog.title, + 'channel_id': str(dialog.id), + 'username': username, + 'type': channel_type + }) count += 1 + + if channels_data: + csv_file = Path('channels_list.csv') + with open(csv_file, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=['number', 'channel_name', 'channel_id', 'username', 'type']) + writer.writeheader() + writer.writerows(channels_data) + print(f"\nāœ… Saved channels list to {csv_file}") + + return channels_data + except Exception as e: print(f"Error listing channels: {e}") + return [] def display_qr_code_ascii(self, qr_login): qr = qrcode.QRCode(box_size=1, border=1) @@ -736,44 +826,66 @@ class OptimizedTelegramScraper: await self.export_data() elif choice == 'l': - channels_list = [] - async for dialog in self.client.iter_dialogs(): - if dialog.id != 777000: - channels_list.append(str(dialog.id)) - - await self.list_channels() + channels_data = await self.list_channels() + + if not channels_data: + continue + print("\nTo add channels from the list above:") print("• Single: 1 or -1001234567890") print("• Multiple: 1,3,5 or mix formats") + print("• All channels: all") print("• Press Enter to skip adding") selection = input("\nEnter selection (or Enter to skip): ").strip() - + if selection: added_count = 0 - for sel in [x.strip() for x in selection.split(',')]: - try: - if sel.startswith('-'): - channel = sel - else: - num = int(sel) - if 1 <= num <= len(channels_list): - channel = channels_list[num - 1] - else: - print(f"Invalid number: {num}. Choose 1-{len(channels_list)}") - continue - - if channel in self.state['channels']: - print(f"Channel {channel} already added") - else: - self.state['channels'][channel] = 0 - self.save_state() - print(f"āœ… Added channel {channel}") + + if selection.lower() == 'all': + for channel_info in channels_data: + channel_id = channel_info['channel_id'] + if channel_id not in self.state['channels']: + self.state['channels'][channel_id] = 0 + if 'channel_names' not in self.state: + self.state['channel_names'] = {} + self.state['channel_names'][channel_id] = channel_info['username'] + print(f"āœ… Added channel {channel_info['channel_name']} (ID: {channel_id})") added_count += 1 - - except ValueError: - print(f"Invalid input: {sel}") - + else: + print(f"Channel {channel_info['channel_name']} already added") + else: + for sel in [x.strip() for x in selection.split(',')]: + try: + if sel.startswith('-'): + channel_id = sel + channel_info = next((c for c in channels_data if c['channel_id'] == channel_id), None) + if not channel_info: + print(f"Channel ID {channel_id} not found") + continue + else: + num = int(sel) + if 1 <= num <= len(channels_data): + channel_info = channels_data[num - 1] + channel_id = channel_info['channel_id'] + else: + print(f"Invalid number: {num}. Choose 1-{len(channels_data)}") + continue + + if channel_id in self.state['channels']: + print(f"Channel {channel_info['channel_name']} already added") + else: + self.state['channels'][channel_id] = 0 + if 'channel_names' not in self.state: + self.state['channel_names'] = {} + self.state['channel_names'][channel_id] = channel_info['username'] + print(f"āœ… Added channel {channel_info['channel_name']} (ID: {channel_id})") + added_count += 1 + + except ValueError: + print(f"Invalid input: {sel}") + if added_count > 0: + self.save_state() print(f"\nšŸŽ‰ Added {added_count} new channel(s)!") await self.view_channels() else: