| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220 |
- using Application.Abstractions.Data;
- using Application.Abstractions.News;
- using Domain.Entities.News;
- using Microsoft.EntityFrameworkCore;
- using Microsoft.Extensions.DependencyInjection;
- using Microsoft.Extensions.Hosting;
- using Microsoft.Extensions.Logging;
- using System.Text.Json;
- using System.Xml.Linq;
- using System.Text.RegularExpressions;
- namespace Infrastructure.News;
/// <summary>
/// Hosted background service that periodically polls the active RSS feed sources,
/// parses their <c>&lt;item&gt;</c> elements and stores articles that are not yet in the database.
/// Also exposes <see cref="FetchSourceAsync"/> so a single source can be fetched on demand.
/// </summary>
public sealed class RssCollectorService(
    IServiceScopeFactory scopeFactory,
    IHttpClientFactory httpClientFactory,
    ILogger<RssCollectorService> logger
) : BackgroundService, IRssCollector
{
    // XML namespaces of the RSS extension modules read by ParseItem.
    private static readonly XNamespace _dc = "http://purl.org/dc/elements/1.1/";
    private static readonly XNamespace _content = "http://purl.org/rss/1.0/modules/content/";
    private static readonly XNamespace _media = "http://search.yahoo.com/mrss/";
    private static readonly XNamespace _slash = "http://purl.org/rss/1.0/modules/slash/";

    // Captures the src attribute value of the first <img> tag in an HTML fragment.
    private static readonly Regex _imgSrcRegex = new(@"<img[^>]+src\s*=\s*[""']([^""']+)[""']", RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Pause right after service start before the first collection pass (gives the DB time to be ready).
    private static readonly TimeSpan _startupDelay = TimeSpan.FromSeconds(5);

    // Pause between two collection passes; each source's own interval is checked inside a pass.
    private static readonly TimeSpan _pollInterval = TimeSpan.FromMinutes(1);

    /// <summary>
    /// Main loop of the hosted service: waits for the startup delay, then runs one
    /// collection pass per poll interval until <paramref name="ct"/> is cancelled.
    /// </summary>
    /// <param name="ct">Token signalled when the host is stopping.</param>
    protected override async Task ExecuteAsync(CancellationToken ct)
    {
        await Task.Delay(_startupDelay, ct);

        while (!ct.IsCancellationRequested)
        {
            try
            {
                await CollectAllFeedsAsync(ct);
            }
            catch (OperationCanceledException) when (ct.IsCancellationRequested)
            {
                // Shutdown requested while collecting: leave the loop quietly.
                break;
            }
            catch (Exception ex)
            {
                // A failed pass must not kill the service; log and retry on the next tick.
                logger.LogError(ex, "RSS 수집 루프에서 예외 발생");
            }

            await Task.Delay(_pollInterval, ct);
        }
    }

    /// <summary>
    /// Fetches a single feed source immediately, regardless of its collection interval.
    /// </summary>
    /// <param name="sourceID">Primary key of the feed source to fetch.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The number of newly stored articles.</returns>
    /// <exception cref="KeyNotFoundException">No source with the given ID exists.</exception>
    public async Task<int> FetchSourceAsync(int sourceID, CancellationToken ct)
    {
        using var scope = scopeFactory.CreateScope();
        var db = scope.ServiceProvider.GetRequiredService<IAppDbContext>();

        var source = await db.RssFeedSource.FindAsync([sourceID], ct)
            ?? throw new KeyNotFoundException($"소스 ID {sourceID}를 찾을 수 없습니다.");

        var count = await FetchAndStoreAsync(source, db, ct);
        source.MarkFetched();
        await db.SaveChangesAsync(ct);
        return count;
    }

    /// <summary>
    /// Runs one collection pass: seeds the default sources when the table is empty,
    /// then fetches every active source whose interval has elapsed.
    /// A failure of one source is logged and does not stop the others.
    /// </summary>
    /// <param name="ct">Cancellation token; cancellation is propagated to the caller.</param>
    private async Task CollectAllFeedsAsync(CancellationToken ct)
    {
        using var scope = scopeFactory.CreateScope();
        var db = scope.ServiceProvider.GetRequiredService<IAppDbContext>();

        // Initial seed: insert the default sources when the table is empty.
        if (!await db.RssFeedSource.AnyAsync(ct))
        {
            await SeedDefaultSourcesAsync(db, ct);
        }

        var sources = await db.RssFeedSource.Where(x => x.IsActive).ToListAsync(ct);
        var now = DateTime.UtcNow;

        foreach (var source in sources)
        {
            // Collection interval check: skip sources fetched less than IntervalMinutes ago.
            if (source.LastFetchedAt.HasValue && (now - source.LastFetchedAt.Value).TotalMinutes < source.IntervalMinutes)
            {
                continue;
            }

            try
            {
                var count = await FetchAndStoreAsync(source, db, ct);
                source.MarkFetched();
                await db.SaveChangesAsync(ct);

                if (count > 0)
                {
                    logger.LogInformation("[RSS] {SourceName}: {Count}건 수집 완료", source.Name, count);
                }
            }
            catch (OperationCanceledException) when (ct.IsCancellationRequested)
            {
                // Host shutdown is not a per-source failure: stop the pass instead of
                // trying (and logging a failure for) every remaining source.
                throw;
            }
            catch (Exception ex)
            {
                // NOTE(review): entities added by a failed FetchAndStoreAsync may stay tracked in
                // this scope's context and be retried by the next SaveChangesAsync — confirm.
                logger.LogWarning(ex, "[RSS] {SourceName} ({Url}) 수집 실패", source.Name, source.Url);
            }
        }
    }

    /// <summary>
    /// Downloads and parses the feed of <paramref name="source"/> and stores the articles
    /// whose guid is not yet present for that source.
    /// </summary>
    /// <param name="source">Feed source to download.</param>
    /// <param name="db">Database context used for the duplicate lookup and the insert.</param>
    /// <param name="ct">Cancellation token.</param>
    /// <returns>The number of newly stored articles.</returns>
    private async Task<int> FetchAndStoreAsync(RssFeedSource source, IAppDbContext db, CancellationToken ct)
    {
        var client = httpClientFactory.CreateClient("RssFeed");
        var xml = await client.GetStringAsync(source.Url, ct);
        var doc = XDocument.Parse(xml);

        var items = doc.Descendants("item").ToList();
        if (items.Count == 0)
        {
            return 0;
        }

        // Parse every item into an article entity.
        var articles = items.Select(item => ParseItem(source.ID, item)).ToList();

        // Duplicate check: look up which of the parsed guids already exist for this source.
        var guids = articles.Where(a => a.Guid != null).Select(a => a.Guid!).Distinct().ToList();
        var seenGuids = new HashSet<string>();
        if (guids.Count > 0)
        {
            var existingGuids = await db.RssNewsArticle
                .Where(x => x.RssFeedSourceID == source.ID && x.Guid != null && guids.Contains(x.Guid!))
                .Select(x => x.Guid!)
                .ToListAsync(ct);
            seenGuids.UnionWith(existingGuids);
        }

        // Keep an article when its guid is new. HashSet.Add also rejects a guid that was already
        // accepted earlier in this same batch, so a feed listing one item twice stores it once.
        // Articles without any guid cannot be de-duplicated and are always kept, as before.
        var newArticles = new List<RssNewsArticle>();
        foreach (var article in articles)
        {
            if (article.Guid == null || seenGuids.Add(article.Guid))
            {
                newArticles.Add(article);
            }
        }

        if (newArticles.Count == 0)
        {
            return 0;
        }

        db.RssNewsArticle.AddRange(newArticles);
        await db.SaveChangesAsync(ct);
        return newArticles.Count;
    }

    /// <summary>
    /// Converts one RSS <c>&lt;item&gt;</c> element into an <see cref="RssNewsArticle"/>.
    /// </summary>
    /// <param name="sourceID">ID of the feed source the item belongs to.</param>
    /// <param name="item">The <c>&lt;item&gt;</c> element to read.</param>
    /// <returns>A new article entity created via <c>RssNewsArticle.Create</c>.</returns>
    private static RssNewsArticle ParseItem(int sourceID, XElement item)
    {
        var title = item.Element("title")?.Value?.Trim() ?? "";
        var link = item.Element("link")?.Value?.Trim();

        // A blank <guid/> must not become the key "": fall back to the link, and to null when
        // that is blank too. Otherwise every blank-guid item would collide on the same key.
        var guid = NullIfBlank(item.Element("guid")?.Value) ?? NullIfBlank(link);

        var author = item.Element(_dc + "creator")?.Value?.Trim();
        var description = item.Element("description")?.Value?.Trim();
        var content = item.Element(_content + "encoded")?.Value?.Trim();
        var sourceName = item.Element("source")?.Value?.Trim();

        // Author fallback: the <source> element (used by Google News).
        if (string.IsNullOrEmpty(author) && !string.IsNullOrEmpty(sourceName))
        {
            author = sourceName;
        }

        // Image priority: media:content > enclosure > first <img> in description > first <img> in content.
        // Blank attribute values count as missing so the next candidate is tried.
        var imageUrl = NullIfBlank(item.Element(_media + "content")?.Attribute("url")?.Value)
            ?? NullIfBlank(item.Element("enclosure")?.Attribute("url")?.Value)
            ?? ExtractFirstImageUrl(description)
            ?? ExtractFirstImageUrl(content);

        // Categories: stored as a JSON array, or null when the item has none.
        var categories = item.Elements("category")
            .Select(c => c.Value.Trim())
            .Where(c => !string.IsNullOrEmpty(c))
            .ToList();
        string? categoriesJson = categories.Count > 0 ? JsonSerializer.Serialize(categories) : null;

        // Comment count: parsed culture-independently; stays 0 when missing or not a number.
        var commentCountStr = item.Element(_slash + "comments")?.Value;
        int.TryParse(
            commentCountStr,
            System.Globalization.NumberStyles.Integer,
            System.Globalization.CultureInfo.InvariantCulture,
            out var commentCount);

        // Publication date: parsed with the invariant culture so the result does not depend on the
        // host locale; converted to UTC. DateTimeStyles.None keeps the previous handling of
        // strings that carry no zone/offset.
        DateTime? publishedAt = null;
        var pubDateStr = item.Element("pubDate")?.Value;
        if (!string.IsNullOrEmpty(pubDateStr)
            && DateTimeOffset.TryParse(
                pubDateStr,
                System.Globalization.CultureInfo.InvariantCulture,
                System.Globalization.DateTimeStyles.None,
                out var dto))
        {
            publishedAt = dto.UtcDateTime;
        }

        return RssNewsArticle.Create(
            rssFeedSourceID: sourceID,
            title: title,
            link: link,
            guid: guid,
            author: author,
            description: description,
            content: content,
            imageUrl: imageUrl,
            sourceName: sourceName,
            categories: categoriesJson,
            commentCount: commentCount,
            publishedAt: publishedAt
        );
    }

    /// <summary>
    /// Returns the <c>src</c> value of the first <c>&lt;img&gt;</c> tag found in <paramref name="html"/>.
    /// </summary>
    /// <param name="html">HTML fragment to scan; may be null or empty.</param>
    /// <returns>The trimmed URL, or null when there is no match or the value is blank.</returns>
    private static string? ExtractFirstImageUrl(string? html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return null;
        }

        var match = _imgSrcRegex.Match(html);
        return match.Success ? NullIfBlank(match.Groups[1].Value) : null;
    }

    /// <summary>
    /// Trims <paramref name="value"/> and maps null, empty or whitespace-only text to null.
    /// </summary>
    private static string? NullIfBlank(string? value)
    {
        return string.IsNullOrWhiteSpace(value) ? null : value.Trim();
    }

    /// <summary>
    /// Inserts the built-in default feed sources and saves them.
    /// The caller invokes this only when the source table is empty.
    /// </summary>
    /// <param name="db">Database context to insert into.</param>
    /// <param name="ct">Cancellation token.</param>
    private static async Task SeedDefaultSourcesAsync(IAppDbContext db, CancellationToken ct)
    {
        db.RssFeedSource.AddRange(
            RssFeedSource.Create("CoinSpeaker KR", "https://www.coinspeaker.com/kr/news/feed/", "CoinSpeaker 한국어 뉴스", 10),
            RssFeedSource.Create("Google News 비트코인", "https://news.google.com/rss/search?q=%EB%B9%84%ED%8A%B8%EC%BD%94%EC%9D%B8&hl=ko&gl=KR&ceid=KR:ko", "구글 뉴스 비트코인 검색", 15),
            RssFeedSource.Create("CoinTelegraph KR", "https://cointelegraph-kr.com/rss", "CoinTelegraph 한국어 뉴스", 10)
        );
        await db.SaveChangesAsync(ct);
    }
}
|