Based on DotnetSpider, a .NET Standard web crawling library. It is a lightweight, efficient, and fast high-level web crawling & scraping framework # DotnetSpider 免责声明:本框架是为了帮助开发人员简化开发流程、提高开发效率,请勿使用此框架做任何违反国家法律的事情,使用者所做任何事情也与本框架的作者无关。 [![Build Status](https://dev.azure.com/zlzforever/DotnetSpider/_apis/build/status/dotnetcore.DotnetSpider?branchName=master)](https://dev.azure.com/zlzforever/DotnetSpider/_build/latest?definitionId=3&branchName=master) [![NuGet](https://img.shields.io/nuget/vpre/DotnetSpider.svg)](https://www.nuget.org/packages/DotnetSpider) [![Member project of .NET Core Community](https://img.shields.io/badge/member%20project%20of-NCC-9e20c9.svg)](https://github.com/dotnetcore) [![GitHub license](https://img.shields.io/github/license/dotnetcore/DotnetSpider.svg)](https://github.com/dotnetcore/DotnetSpider/blob/master/LICENSE.txt) DotnetSpider, a .NET Standard web crawling library. It is a lightweight, efficient, and fast high-level web crawling & scraping framework. If you want to get the latest beta packages, you should add the MyGet feed: ````html <add key="myget.org" value="https://www.myget.org/F/zlzforever/api/v3/index.json" protocolVersion="3" /> ```` ### DESIGN ![DESIGN IMAGE](https://github.com/dotnetcore/DotnetSpider/blob/master/images/%E6%95%B0%E6%8D%AE%E9%87%87%E9%9B%86%E7%B3%BB%E7%BB%9F.png?raw=true) ### DEVELOPMENT ENVIRONMENT 1. Visual Studio 2017 (15.3 or later) or JetBrains Rider 2. [.NET Core 2.2 or later](https://www.microsoft.com/net/download/windows) 3. Docker 4. MySQL docker run --name mysql -d -p 3306:3306 --restart always -e MYSQL_ROOT_PASSWORD=1qazZAQ! mysql:5.7 5. Redis (optional) docker run --name redis -d -p 6379:6379 --restart always redis 6. SQL Server docker run --name sqlserver -d -p 1433:1433 --restart always -e 'ACCEPT_EULA=Y' -e 'SA_PASSWORD=1qazZAQ!' mcr.microsoft.com/mssql/server:2017-latest 8. PostgreSQL (optional) docker run --name postgres -d -p 5432:5432 --restart always -e POSTGRES_PASSWORD=1qazZAQ! postgres 9. 
MongoDb (option) docker run --name mongo -d -p 27017:27017 --restart always mongo 10. RabbitMQ docker run -d --restart always --name rabbimq -p 4369:4369 -p 5671-5672:5671-5672 -p 25672:25672 -p 15671-15672:15671-15672 \ -e RABBITMQ_DEFAULT_USER=user -e RABBITMQ_DEFAULT_PASS=password \ rabbitmq:3-management 11. Docker remote api for mac docker run -d --restart always --name socat -v /var/run/docker.sock:/var/run/docker.sock -p 2376:2375 bobrik/socat TCP4-LISTEN:2375,fork,reuseaddr UNIX-CONNECT:/var/run/docker.sock 12. HBase docker run -d --restart always --name hbase -p 20550:8080 -p 8085:8085 -p 9090:9090 -p 9095:9095 -p 16010:16010 dajobe/hbase ### MORE DOCUMENTS https://github.com/dotnetcore/DotnetSpider/wiki ### SAMPLES Please see the Project DotnetSpider.Sample in the solution. ### BASE USAGE [Base usage Codes](https://github.com/dotnetcore/DotnetSpider/blob/master/src/DotnetSpider.Sample/samples/BaseUsageSpider.cs) ### ADDITIONAL USAGE: Configurable Entity Spider [View complete Codes](https://github.com/dotnetcore/DotnetSpider/blob/master/src/DotnetSpider.Sample/samples/EntitySpider.cs) ````csharp [DisplayName("博客园爬虫")] public class EntitySpider( IOptions<SpiderOptions> options, DependenceServices services, ILogger<Spider> logger) : Spider(options, services, logger) { public static async Task RunAsync() { var builder = Builder.CreateDefaultBuilder<EntitySpider>(options => { options.Speed = 1; }); builder.UseSerilog(); builder.IgnoreServerCertificateError(); await builder.Build().RunAsync(); } protected override async Task InitializeAsync(CancellationToken stoppingToken = default) { AddDataFlow<DataParser<CnblogsEntry>>(); AddDataFlow(GetDefaultStorage); await AddRequestsAsync( new Request( "https://news.cnblogs.com/n/page/1", new Dictionary<string, object> { { "网站", "博客园" } })); } [Schema("cnblogs", "news")] [EntitySelector(Expression = ".//div[@class='news_block']", Type = SelectorType.XPath)] [GlobalValueSelector(Expression = ".//a[@class='current']", Name 
= "类别", Type = SelectorType.XPath)] [GlobalValueSelector(Expression = "//title", Name = "Title", Type = SelectorType.XPath)] [FollowRequestSelector(Expressions = ["//div[@class='pager']"])] public class CnblogsEntry : EntityBase<CnblogsEntry> { protected override void Configure() { HasIndex(x => x.Title); HasIndex(x => new { x.WebSite, x.Guid }, true); } public int Id { get; set; } [Required] [StringLength(200)] [ValueSelector(Expression = "类别", Type = SelectorType.Environment)] public string Category { get; set; } [Required] [StringLength(200)] [ValueSelector(Expression = "网站", Type = SelectorType.Environment)] public string WebSite { get; set; } [StringLength(200)] [ValueSelector(Expression = "Title", Type = SelectorType.Environment)] [ReplaceFormatter(NewValue = "", OldValue = " - 博客园")] public string Title { get; set; } [StringLength(40)] [ValueSelector(Expression = "GUID", Type = SelectorType.Environment)] public string Guid { get; set; } [ValueSelector(Expression = ".//h2[@class='news_entry']/a")] public string News { get; set; } [ValueSelector(Expression = ".//h2[@class='news_entry']/a/@href")] public string Url { get; set; } [ValueSelector(Expression = ".//div[@class='entry_summary']")] [TrimFormatter] public string PlainText { get; set; } [ValueSelector(Expression = "DATETIME", Type = SelectorType.Environment)] public DateTime CreationTime { get; set; } } } ```` #### Distributed spider [Read this document](https://github.com/dotnetcore/DotnetSpider/wiki/3-Distributed-Spider) #### Puppeteer downloader Coming soon ### NOTICE #### when you use redis scheduler, please update your redis config: timeout 0 tcp-keepalive 60 ### Dependencies | Package | License | | --- | --- | | Bert.RateLimiters | Apache 2.0 | | MessagePack | MIT | | Newtonsoft.Json | MIT | | Dapper | Apache 2.0 | | HtmlAgilityPack | MIT | | ZCJ.HashedWheelTimer | MIT | | murmurhash | Apache 2.0 | | Serilog.AspNetCore | Apache 2.0 | | Serilog.Sinks.Console | Apache 2.0 | | 
Serilog.Sinks.RollingFile | Apache 2.0 | | Serilog.Sinks.PeriodicBatching | Apache 2.0 | | MongoDB.Driver | Apache 2.0 | | MySqlConnector | MIT | | AutoMapper.Extensions.Microsoft.DependencyInjection | MIT | | Docker.DotNet | MIT | | BuildBundlerMinifier | Apache 2.0 | | Pomelo.EntityFrameworkCore.MySql | MIT | | Quartz.AspNetCore | Apache 2.0 | | Quartz.AspNetCore.MySqlConnector | Apache 2.0 | | Npgsql | PostgreSQL License | | RabbitMQ.Client | Apache 2.0 | | Polly | BSD 3-Clause | ### AREAS FOR IMPROVEMENT QQ Group: 477731655 Email: [email protected] ", Assign "at most 3 tags" to the expected json: {"id":"6637","tags":[]} "only from the tags list I provide: [{"id":77,"name":"3d"},{"id":89,"name":"agent"},{"id":17,"name":"ai"},{"id":54,"name":"algorithm"},{"id":24,"name":"api"},{"id":44,"name":"authentication"},{"id":3,"name":"aws"},{"id":27,"name":"backend"},{"id":60,"name":"benchmark"},{"id":72,"name":"best-practices"},{"id":39,"name":"bitcoin"},{"id":37,"name":"blockchain"},{"id":1,"name":"blog"},{"id":45,"name":"bundler"},{"id":58,"name":"cache"},{"id":21,"name":"chat"},{"id":49,"name":"cicd"},{"id":4,"name":"cli"},{"id":64,"name":"cloud-native"},{"id":48,"name":"cms"},{"id":61,"name":"compiler"},{"id":68,"name":"containerization"},{"id":92,"name":"crm"},{"id":34,"name":"data"},{"id":47,"name":"database"},{"id":8,"name":"declarative-gui "},{"id":9,"name":"deploy-tool"},{"id":53,"name":"desktop-app"},{"id":6,"name":"dev-exp-lib"},{"id":59,"name":"dev-tool"},{"id":13,"name":"ecommerce"},{"id":26,"name":"editor"},{"id":66,"name":"emulator"},{"id":62,"name":"filesystem"},{"id":80,"name":"finance"},{"id":15,"name":"firmware"},{"id":73,"name":"for-fun"},{"id":2,"name":"framework"},{"id":11,"name":"frontend"},{"id":22,"name":"game"},{"id":81,"name":"game-engine 
"},{"id":23,"name":"graphql"},{"id":84,"name":"gui"},{"id":91,"name":"http"},{"id":5,"name":"http-client"},{"id":51,"name":"iac"},{"id":30,"name":"ide"},{"id":78,"name":"iot"},{"id":40,"name":"json"},{"id":83,"name":"julian"},{"id":38,"name":"k8s"},{"id":31,"name":"language"},{"id":10,"name":"learning-resource"},{"id":33,"name":"lib"},{"id":41,"name":"linter"},{"id":28,"name":"lms"},{"id":16,"name":"logging"},{"id":76,"name":"low-code"},{"id":90,"name":"message-queue"},{"id":42,"name":"mobile-app"},{"id":18,"name":"monitoring"},{"id":36,"name":"networking"},{"id":7,"name":"node-version"},{"id":55,"name":"nosql"},{"id":57,"name":"observability"},{"id":46,"name":"orm"},{"id":52,"name":"os"},{"id":14,"name":"parser"},{"id":74,"name":"react"},{"id":82,"name":"real-time"},{"id":56,"name":"robot"},{"id":65,"name":"runtime"},{"id":32,"name":"sdk"},{"id":71,"name":"search"},{"id":63,"name":"secrets"},{"id":25,"name":"security"},{"id":85,"name":"server"},{"id":86,"name":"serverless"},{"id":70,"name":"storage"},{"id":75,"name":"system-design"},{"id":79,"name":"terminal"},{"id":29,"name":"testing"},{"id":12,"name":"ui"},{"id":50,"name":"ux"},{"id":88,"name":"video"},{"id":20,"name":"web-app"},{"id":35,"name":"web-server"},{"id":43,"name":"webassembly"},{"id":69,"name":"workflow"},{"id":87,"name":"yaml"}]" returns me the "expected json"